From f7a25cf1d4707da39b80df96a3be8a8abd07c35b Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 13 Nov 2023 11:40:26 +0800 Subject: [PATCH 001/310] x86/setup: Make relocated_ramdisk a local variable of relocate_initrd() After 0b62f6cb0773 ("x86/microcode/32: Move early loading after paging enable"), the global variable relocated_ramdisk is no longer used anywhere except for the relocate_initrd() function. Make it a local variable of that function. Signed-off-by: Yuntao Wang Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Baoquan He Link: https://lore.kernel.org/r/20231113034026.130679-1-ytcoode@gmail.com --- arch/x86/include/asm/setup.h | 2 -- arch/x86/kernel/setup.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bf483fcb4e57..5c83729c8e71 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -31,8 +31,6 @@ #include #include -extern u64 relocated_ramdisk; - /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1526747bedf2..ec2c21a1844e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -226,8 +226,6 @@ static void __init reserve_brk(void) _brk_start = 0; } -u64 relocated_ramdisk; - #ifdef CONFIG_BLK_DEV_INITRD static u64 __init get_ramdisk_image(void) @@ -261,7 +259,7 @@ static void __init relocate_initrd(void) u64 area_size = PAGE_ALIGN(ramdisk_size); /* We need to move the initrd down into directly mapped mem */ - relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, PFN_PHYS(max_pfn_mapped)); if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", From 9f3b130048bfa2e44a8cfb1b616f826d9d5d8188 Mon Sep 17 00:00:00 2001 From: Zhiquan Li Date: Thu, 26 Oct 2023 08:39:03 +0800 Subject: [PATCH 002/310] x86/mce: Mark fatal MCE's page as poison to avoid panic in the kdump kernel Memory errors don't happen very often, especially fatal ones. However, in large-scale scenarios such as data centers, that probability increases with the amount of machines present. When a fatal machine check happens, mce_panic() is called based on the severity grading of that error. The page containing the error is not marked as poison. However, when kexec is enabled, tools like makedumpfile understand when pages are marked as poison and do not touch them so as not to cause a fatal machine check exception again while dumping the previous kernel's memory. Therefore, mark the page containing the error as poisoned so that the kexec'ed kernel can avoid accessing the page. [ bp: Rewrite commit message and comment. ] Co-developed-by: Youquan Song Signed-off-by: Youquan Song Signed-off-by: Zhiquan Li Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Naoya Horiguchi Link: https://lore.kernel.org/r/20231014051754.3759099-1-zhiquan1.li@intel.com --- arch/x86/kernel/cpu/mce/core.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7b397370b4d6..df8d25e744d1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + struct page *p; /* * Allow instrumentation around external facilities usage. Not that it @@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. + */ + if (kexec_crash_loaded()) { + if (final && (final->status & MCI_STATUS_ADDRV)) { + p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); From 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 27 Oct 2023 14:24:16 +0200 Subject: [PATCH 003/310] x86/barrier: Do not serialize MSR accesses on AMD AMD does not have the requirement for a synchronization barrier when acccessing a certain group of MSRs. Do not incur that unnecessary penalty there. There will be a CPUID bit which explicitly states that a MFENCE is not needed. Once that bit is added to the APM, this will be extended with it. While at it, move to processor.h to avoid include hell. Untangling that file properly is a matter for another day. Some notes on the performance aspect of why this is relevant, courtesy of Kishon VijayAbraham : On a AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM shows x2AVIC IPI rate is 3% to 4% lower than AVIC IPI rate. The ipi-bench is modified so that the IPIs are sent between two vCPUs in the same CCX. This also requires to pin the vCPU to a physical core to prevent any latencies. This simulates the use case of pinning vCPUs to the thread of a single CCX to avoid interrupt IPI latency. In order to avoid run-to-run variance (for both x2AVIC and AVIC), the below configurations are done: 1) Disable Power States in BIOS (to prevent the system from going to lower power state) 2) Run the system at fixed frequency 2500MHz (to prevent the system from increasing the frequency when the load is more) With the above configuration: *) Performance measured using ipi-bench for AVIC: Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] *) Performance measured using ipi-bench for x2AVIC: Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] From above, x2AVIC latency is ~4% more than AVIC. However, the expectation is x2AVIC performance to be better or equivalent to AVIC. Upon analyzing the perf captures, it is observed significant time is spent in weak_wrmsr_fence() invoked by x2apic_send_IPI(). With the fix to skip weak_wrmsr_fence() *) Performance measured using ipi-bench for x2AVIC: Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU] Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from 48 vCPUs simultaneously] Comparing the performance of x2AVIC with and without the fix, it can be seen the performance improves by ~4%. Performance captured using an unmodified ipi-bench using the 'mesh-ipi' option with and without weak_wrmsr_fence() on a Zen4 system also showed significant performance improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores CCX or CCD and just picks random vCPU. Average throughput (10 iterations) with weak_wrmsr_fence(), Cumulative throughput: 4933374 IPI/s Average throughput (10 iterations) without weak_wrmsr_fence(), Cumulative throughput: 6355156 IPI/s [1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de --- arch/x86/include/asm/barrier.h | 18 ------------------ arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/processor.h | 18 ++++++++++++++++++ arch/x86/kernel/cpu/amd.c | 3 +++ arch/x86/kernel/cpu/common.c | 7 +++++++ arch/x86/kernel/cpu/hygon.c | 3 +++ 6 files changed, 32 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 35389b2af88e..0216f63a366b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -81,22 +81,4 @@ do { \ #include -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. - */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4af140cf5719..3e973ff20e23 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -308,10 +308,10 @@ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ #define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */ - #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ae81a7191c1c..26620d7642a9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -749,4 +749,22 @@ enum mds_mitigations { extern bool gds_ucode_mitigated(void); +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a7eab05e5f29..841e21213668 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1162,6 +1162,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has_amd_erratum(c, amd_erratum_1485)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b14fc8c1c953..98f7ea6b931c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1856,6 +1856,13 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); #endif + + /* + * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and + * Hygon will clear it in ->c_init() below. + */ + set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6f247d66758d..f0cd95502faa 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -354,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); + + /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) From 5c22c4726e4a9c6b2e182c0b21c2d3dd63f608c4 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Fri, 9 Jun 2023 17:45:31 +0800 Subject: [PATCH 004/310] x86/paravirt: Use relative reference for the original instruction offset Similar to the alternative patching, use a relative reference for original instruction offset rather than absolute one, which saves 8 bytes for one PARA_SITE entry on x86_64. As a result, a R_X86_64_PC32 relocation is generated instead of an R_X86_64_64 one, which also reduces relocation metadata on relocatable builds. Hardcode the alignment to 4 now. [ bp: Massage commit message. ] Signed-off-by: Hou Wenlong Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/9e6053107fbaabc0d33e5d2865c5af2c67ec9925.1686301237.git.houwenlong.hwl@antgroup.com --- arch/x86/include/asm/paravirt.h | 10 +++++----- arch/x86/include/asm/paravirt_types.h | 8 ++++---- arch/x86/kernel/alternative.c | 8 +++++--- arch/x86/kernel/callthunks.c | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6c8ff12140ae..d9384e9d97f6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -743,16 +743,16 @@ void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, ops, word, algn) \ +#define _PVSITE(ptype, ops) \ 771:; \ ops; \ 772:; \ .pushsection .parainstructions,"a"; \ - .align algn; \ - word 771b; \ + .align 4; \ + .long 771b-.; \ .byte ptype; \ .byte 772b-771b; \ - _ASM_ALIGN; \ + .align 4; \ .popsection @@ -760,7 +760,7 @@ void native_pv_lock_init(void) __init; #ifdef CONFIG_PARAVIRT_XXL #define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) +#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops) #define PARA_INDIRECT(addr) *addr(%rip) #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 772d03487520..e1bfb719fca3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -5,7 +5,7 @@ #ifndef __ASSEMBLY__ /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch_site { - u8 *instr; /* original instructions */ + s32 instr_offset; /* original instructions */ u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; @@ -263,11 +263,11 @@ extern struct paravirt_patch_template pv_ops; #define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ + " .align 4\n" \ + " .long 771b-.\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - _ASM_ALIGN "\n" \ + " .align 4\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 73be3931e4f0..be35c8ccf826 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1431,20 +1431,22 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, { struct paravirt_patch_site *p; char insn_buff[MAX_PATCH_LEN]; + u8 *instr; for (p = start; p < end; p++) { unsigned int used; + instr = (u8 *)&p->instr_offset + p->instr_offset; BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ - memcpy(insn_buff, p->instr, p->len); - used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + memcpy(insn_buff, instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)instr, p->len); BUG_ON(used > p->len); /* Pad the rest with nops */ add_nops(insn_buff + used, p->len - used); - text_poke_early(p->instr, insn_buff, p->len); + text_poke_early(instr, insn_buff, p->len); } } extern struct paravirt_patch_site __start_parainstructions[], diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e9ad518a5003..57e5c2e75c2a 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -240,7 +240,7 @@ patch_paravirt_call_sites(struct paravirt_patch_site *start, struct paravirt_patch_site *p; for (p = start; p < end; p++) - patch_call(p->instr, ct); + patch_call((void *)&p->instr_offset + p->instr_offset, ct); } static __init_or_module void From fe22bc430c9d24394e541e16e0941a075f02fcb7 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Fri, 9 Jun 2023 17:45:32 +0800 Subject: [PATCH 005/310] x86/paravirt: Make the struct paravirt_patch_site packed Similar to struct alt_instr, make the struct paravirt_patch_site packed and get rid of all the .align directives and save 2 bytes for one PARA_SITE entry on X86_64. [ bp: Massage commit message. ] Suggested-by: Nadav Amit Signed-off-by: Hou Wenlong Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/6dcb20159ded36586c5f7f2ae159e4e030256627.1686301237.git.houwenlong.hwl@antgroup.com --- arch/x86/include/asm/paravirt.h | 2 -- arch/x86/include/asm/paravirt_types.h | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d9384e9d97f6..693c61dbdd9c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -748,11 +748,9 @@ void native_pv_lock_init(void) __init; ops; \ 772:; \ .pushsection .parainstructions,"a"; \ - .align 4; \ .long 771b-.; \ .byte ptype; \ .byte 772b-771b; \ - .align 4; \ .popsection diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index e1bfb719fca3..f4fb2e3ec7b8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -8,7 +8,7 @@ struct paravirt_patch_site { s32 instr_offset; /* original instructions */ u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ -}; +} __packed; #endif #ifdef CONFIG_PARAVIRT @@ -263,11 +263,9 @@ extern struct paravirt_patch_template pv_ops; #define _paravirt_alt(insn_string, type) \ "771:\n\t" insn_string "\n" "772:\n" \ ".pushsection .parainstructions,\"a\"\n" \ - " .align 4\n" \ " .long 771b-.\n" \ " .byte " type "\n" \ " .byte 772b-771b\n" \ - " .align 4\n" \ ".popsection\n" /* Generate patchable code, with the default asm parameters. */ From e4ab322fbaaaf84b23d6cb0e3317a7f68baf36dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Sep 2023 13:22:17 +0200 Subject: [PATCH 006/310] cleanup: Add conditional guard support Adds: - DEFINE_GUARD_COND() / DEFINE_LOCK_GUARD_1_COND() to extend existing guards with conditional lock primitives, eg. mutex_trylock(), mutex_lock_interruptible(). nb. both primitives allow NULL 'locks', which cause the lock to fail (obviously). - extends scoped_guard() to not take the body when the the conditional guard 'fails'. eg. scoped_guard (mutex_intr, &task->signal_cred_guard_mutex) { ... } will only execute the body when the mutex is held. - provides scoped_cond_guard(name, fail, args...); which extends scoped_guard() to do fail when the lock-acquire fails. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231102110706.460851167%40infradead.org --- include/linux/cleanup.h | 52 +++++++++++++++++++++++++++++++++++++--- include/linux/mutex.h | 3 ++- include/linux/rwsem.h | 8 +++---- include/linux/spinlock.h | 15 ++++++++++++ 4 files changed, 70 insertions(+), 8 deletions(-) diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 9f1a9c455b68..c2d09bc4f976 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -125,25 +125,55 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * trivial wrapper around DEFINE_CLASS() above specifically * for locks. * + * DEFINE_GUARD_COND(name, ext, condlock) + * wrapper around EXTEND_CLASS above to add conditional lock + * variants to a base class, eg. mutex_trylock() or + * mutex_lock_interruptible(). + * * guard(name): - * an anonymous instance of the (guard) class + * an anonymous instance of the (guard) class, not recommended for + * conditional locks. * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is * bound to the next (compound) statement. * + * for conditional locks the loop body is skipped when the lock is not + * acquired. + * + * scoped_cond_guard (name, fail, args...) { }: + * similar to scoped_guard(), except it does fail when the lock + * acquire fails. + * */ #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T) + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return *_T; } + +#define DEFINE_GUARD_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \ + class_##_name##_t _T) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } #define guard(_name) \ CLASS(_name, __UNIQUE_ID(guard)) +#define __guard_ptr(_name) class_##_name##_lock_ptr + #define scoped_guard(_name, args...) \ for (CLASS(_name, scope)(args), \ - *done = NULL; !done; done = (void *)1) + *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1) + +#define scoped_cond_guard(_name, _fail, args...) \ + for (CLASS(_name, scope)(args), \ + *done = NULL; !done; done = (void *)1) \ + if (!__guard_ptr(_name)(&scope)) _fail; \ + else /* * Additional helper macros for generating lock guards with types, either for @@ -152,6 +182,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * * DEFINE_LOCK_GUARD_0(name, lock, unlock, ...) * DEFINE_LOCK_GUARD_1(name, type, lock, unlock, ...) + * DEFINE_LOCK_GUARD_1_COND(name, ext, condlock) * * will result in the following type: * @@ -173,6 +204,11 @@ typedef struct { \ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (_T->lock) { _unlock; } \ +} \ + \ +static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ +{ \ + return _T->lock; \ } @@ -201,4 +237,14 @@ __DEFINE_LOCK_GUARD_1(_name, _type, _lock) __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__) \ __DEFINE_LOCK_GUARD_0(_name, _lock) +#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ + if (_T->lock && !(_condlock)) _T->lock = NULL; \ + _t; }), \ + typeof_member(class_##_name##_t, lock) l) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } + + #endif /* __LINUX_GUARDS_H */ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index a33aa9eb9fc3..95d11308f995 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -221,6 +221,7 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) -DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T)) +DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) +DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 1dd530ce8b45..9c29689ff505 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -203,11 +203,11 @@ extern void up_read(struct rw_semaphore *sem); extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) +DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) +DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0) + DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) - -DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T)) -DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T)) - +DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) /* * downgrade write lock to read lock diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 31d3d747a9db..ceb56b39c70f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -507,6 +507,8 @@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) @@ -515,23 +517,36 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, + raw_spin_trylock_irqsave(_T->lock, _T->flags)) + DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, + spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, + spin_trylock_irqsave(_T->lock, _T->flags)) + #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ From 612905e13b8769caca7ec4194a8aceb24efa4d5c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 7 Nov 2023 18:55:29 +0200 Subject: [PATCH 007/310] x86/mce: Remove redundant check from mce_device_create() mce_device_create() is called only from mce_cpu_online() which in turn will be called iff MCA support is available. That is, at the time of mce_device_create() call it's guaranteed that MCA support is available. No need to duplicate this check so remove it. [ bp: Massage commit message. ] Signed-off-by: Nikolay Borisov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231107165529.407349-1-nik.borisov@suse.com --- arch/x86/kernel/cpu/mce/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index df8d25e744d1..1642018dd6c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2584,9 +2584,6 @@ static int mce_device_create(unsigned int cpu) int err; int i, j; - if (!mce_available(&boot_cpu_data)) - return -EIO; - dev = per_cpu(mce_device, cpu); if (dev) return 0; From a24d61c609813963aacc9f6ec8343f4fcaac7243 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Nov 2023 17:49:01 +0000 Subject: [PATCH 008/310] x86/lib: Fix overflow when counting digits tl;dr: The num_digits() function has a theoretical overflow issue. But it doesn't affect any actual in-tree users. Fix it by using a larger type for one of the local variables. Long version: There is an overflow in variable m in function num_digits when val is >= 1410065408 which leads to the digit calculation loop to iterate more times than required. This results in either more digits being counted or in some cases (for example where val is 1932683193) the value of m eventually overflows to zero and the while loop spins forever). Currently the function num_digits is currently only being used for small values of val in the SMP boot stage for digit counting on the number of cpus and NUMA nodes, so the overflow is never encountered. However it is useful to fix the overflow issue in case the function is used for other purposes in the future. (The issue was discovered while investigating the digit counting performance in various kernel helper functions rather than any real-world use-case). The simplest fix is to make m a long long, the overhead in multiplication speed for a long long is very minor for small values of val less than 10000 on modern processors. The alternative fix is to replace the multiplication with a constant division by 10 loop (this compiles down to an multiplication and shift) without needing to make m a long long, but this is slightly slower than the fix in this commit when measured on a range of x86 processors). [ dhansen: subject and changelog tweaks ] Fixes: 646e29a1789a ("x86: Improve the printout of the SMP bootup CPU table") Signed-off-by: Colin Ian King Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20231102174901.2590325-1-colin.i.king%40gmail.com --- arch/x86/lib/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index 92cd8ecc3a2c..40b81c338ae5 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -8,7 +8,7 @@ */ int num_digits(int val) { - int m = 10; + long long m = 10; int d = 1; if (val < 0) { From 1b09892962048a66ab46bf489c2b1264c1ae99ee Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 15 Nov 2023 15:02:01 -0600 Subject: [PATCH 009/310] EDAC/altera: Use device_get_match_data() Use preferred device_get_match_data() instead of of_match_device() to get the driver match data. With this, adjust the includes to explicitly include the correct headers. Signed-off-by: Rob Herring Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231115210201.3743564-1-robh@kernel.org --- drivers/edac/altera_edac.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 8b31cd54bdb6..541acf5eba05 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -279,7 +280,6 @@ release: static int altr_sdram_probe(struct platform_device *pdev) { - const struct of_device_id *id; struct edac_mc_layer layers[2]; struct mem_ctl_info *mci; struct altr_sdram_mc_data *drvdata; @@ -290,10 +290,6 @@ static int altr_sdram_probe(struct platform_device *pdev) int irq, irq2, res = 0; unsigned long mem_size, irqflags = 0; - id = of_match_device(altr_sdram_ctrl_of_match, &pdev->dev); - if (!id) - return -ENODEV; - /* Grab the register range from the sdr controller in device tree */ mc_vbase = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "altr,sdr-syscon"); @@ -304,8 +300,7 @@ static int altr_sdram_probe(struct platform_device *pdev) } /* Check specific dependencies for the module */ - priv = of_match_node(altr_sdram_ctrl_of_match, - pdev->dev.of_node)->data; + priv = device_get_match_data(&pdev->dev); /* Validate the SDRAM controller has ECC enabled */ if (regmap_read(mc_vbase, priv->ecc_ctrl_offset, &read_reg) || From 4e15b91c5b7919c530c27f39c7f2d392bf0a95e3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 13 Nov 2023 14:52:52 +0100 Subject: [PATCH 010/310] x86/mtrr: Document missing function parameters in kernel-doc Add text explaining what they do. No functional changes. Closes: https://lore.kernel.org/oe-kbuild-all/202311130104.9xKAKzke-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/202311130104.9xKAKzke-lkp@intel.com --- arch/x86/kernel/cpu/mtrr/generic.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 2d6aa5d2e3d7..d3524778a545 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -428,6 +428,10 @@ void __init mtrr_copy_map(void) * from the x86_init.hyper.init_platform() hook. It can be called only once. * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR * is cleared. + * + * @var: MTRR variable range array to use + * @num_var: length of the @var array + * @def_type: default caching type */ void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, mtrr_type def_type) @@ -492,13 +496,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform) /** * mtrr_type_lookup - look up memory type in MTRR * + * @start: Begin of the physical address range + * @end: End of the physical address range + * @uniform: output argument: + * - 1: the returned MTRR type is valid for the whole region + * - 0: otherwise + * * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled - * - * Output Argument: - * uniform - Set to 1 when the returned MTRR type is valid for the whole - * region, set to 0 else. */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { From b73e11c87339cac6a701cfed19a26c90f5fa0581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:34 +0200 Subject: [PATCH 011/310] EDAC/altera: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-2-u.kleine-koenig@pengutronix.de --- drivers/edac/altera_edac.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 541acf5eba05..ae17ce4d9722 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -454,15 +454,13 @@ free: return res; } -static int altr_sdram_remove(struct platform_device *pdev) +static void altr_sdram_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); platform_set_drvdata(pdev, NULL); - - return 0; } /* @@ -484,7 +482,7 @@ static const struct dev_pm_ops altr_sdram_pm_ops = { static struct platform_driver altr_sdram_edac_driver = { .probe = altr_sdram_probe, - .remove = altr_sdram_remove, + .remove_new = altr_sdram_remove, .driver = { .name = "altr_sdram_edac", #ifdef CONFIG_PM @@ -807,7 +805,7 @@ fail: return res; } -static int altr_edac_device_remove(struct platform_device *pdev) +static void altr_edac_device_remove(struct platform_device *pdev) { struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); struct altr_edac_device_dev *drvdata = dci->pvt_info; @@ -815,13 +813,11 @@ static int altr_edac_device_remove(struct platform_device *pdev) debugfs_remove_recursive(drvdata->debugfs_dir); edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(dci); - - return 0; } static struct platform_driver altr_edac_device_driver = { .probe = altr_edac_device_probe, - .remove = altr_edac_device_remove, + .remove_new = altr_edac_device_remove, .driver = { .name = "altr_edac_device", .of_match_table = altr_edac_device_of_match, From 5aafd02da7e29acc0b16fb178f140214eb6ea5a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:35 +0200 Subject: [PATCH 012/310] EDAC/armada_xp: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-3-u.kleine-koenig@pengutronix.de --- drivers/edac/armada_xp_edac.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index c4bd2fb9c46b..5bcd34f23baf 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -351,20 +351,18 @@ static int axp_mc_probe(struct platform_device *pdev) return 0; } -static int axp_mc_remove(struct platform_device *pdev) +static void axp_mc_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); platform_set_drvdata(pdev, NULL); - - return 0; } static struct platform_driver axp_mc_driver = { .probe = axp_mc_probe, - .remove = axp_mc_remove, + .remove_new = axp_mc_remove, .driver = { .name = "armada_xp_mc_edac", .of_match_table = of_match_ptr(axp_mc_of_match), @@ -564,7 +562,7 @@ static int aurora_l2_probe(struct platform_device *pdev) return 0; } -static int aurora_l2_remove(struct platform_device *pdev) +static void aurora_l2_remove(struct platform_device *pdev) { struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); #ifdef CONFIG_EDAC_DEBUG @@ -575,13 +573,11 @@ static int aurora_l2_remove(struct platform_device *pdev) edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(dci); platform_set_drvdata(pdev, NULL); - - return 0; } static struct platform_driver aurora_l2_driver = { .probe = aurora_l2_probe, - .remove = aurora_l2_remove, + .remove_new = aurora_l2_remove, .driver = { .name = "aurora_l2_edac", .of_match_table = of_match_ptr(aurora_l2_of_match), From 2546fffd91299e60e0b1558b61448d054f37274b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:36 +0200 Subject: [PATCH 013/310] EDAC/aspeed: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-4-u.kleine-koenig@pengutronix.de --- drivers/edac/aspeed_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/aspeed_edac.c b/drivers/edac/aspeed_edac.c index 6bd5f8815919..157a480eb761 100644 --- a/drivers/edac/aspeed_edac.c +++ b/drivers/edac/aspeed_edac.c @@ -357,7 +357,7 @@ probe_exit02: } -static int aspeed_remove(struct platform_device *pdev) +static void aspeed_remove(struct platform_device *pdev) { struct mem_ctl_info *mci; @@ -369,8 +369,6 @@ static int aspeed_remove(struct platform_device *pdev) mci = edac_mc_del_mc(&pdev->dev); if (mci) edac_mc_free(mci); - - return 0; } @@ -389,7 +387,7 @@ static struct platform_driver aspeed_driver = { .of_match_table = aspeed_of_match }, .probe = aspeed_probe, - .remove = aspeed_remove + .remove_new = aspeed_remove }; module_platform_driver(aspeed_driver); From a5347591eb6fa1263bb0902714c3c8f07dd4748b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:37 +0200 Subject: [PATCH 014/310] EDAC/bluefield: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-5-u.kleine-koenig@pengutronix.de --- drivers/edac/bluefield_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/bluefield_edac.c b/drivers/edac/bluefield_edac.c index e4736eb37bfb..5b3164560648 100644 --- a/drivers/edac/bluefield_edac.c +++ b/drivers/edac/bluefield_edac.c @@ -323,14 +323,12 @@ err: } -static int bluefield_edac_mc_remove(struct platform_device *pdev) +static void bluefield_edac_mc_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static const struct acpi_device_id bluefield_mc_acpi_ids[] = { @@ -346,7 +344,7 @@ static struct platform_driver bluefield_edac_mc_driver = { .acpi_match_table = bluefield_mc_acpi_ids, }, .probe = bluefield_edac_mc_probe, - .remove = bluefield_edac_mc_remove, + .remove_new = bluefield_edac_mc_remove, }; module_platform_driver(bluefield_edac_mc_driver); From d8d9f99fd03322519066f97eac982c464404f221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:38 +0200 Subject: [PATCH 015/310] EDAC/cell: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-6-u.kleine-koenig@pengutronix.de --- drivers/edac/cell_edac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index bc1f3416400e..2000f66fbf5c 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -234,12 +234,11 @@ static int cell_edac_probe(struct platform_device *pdev) return 0; } -static int cell_edac_remove(struct platform_device *pdev) +static void cell_edac_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = edac_mc_del_mc(&pdev->dev); if (mci) edac_mc_free(mci); - return 0; } static struct platform_driver cell_edac_driver = { @@ -247,7 +246,7 @@ static struct platform_driver cell_edac_driver = { .name = "cbe-mic", }, .probe = cell_edac_probe, - .remove = cell_edac_remove, + .remove_new = cell_edac_remove, }; static int __init cell_edac_init(void) From 0576ded05b33dd460082b29d900e2f17a56c5a6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:39 +0200 Subject: [PATCH 016/310] EDAC/cpc925: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-7-u.kleine-koenig@pengutronix.de --- drivers/edac/cpc925_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c index 9797e6d60dde..5075dc7526e3 100644 --- a/drivers/edac/cpc925_edac.c +++ b/drivers/edac/cpc925_edac.c @@ -1010,7 +1010,7 @@ out: return res; } -static int cpc925_remove(struct platform_device *pdev) +static void cpc925_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); @@ -1023,13 +1023,11 @@ static int cpc925_remove(struct platform_device *pdev) edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static struct platform_driver cpc925_edac_driver = { .probe = cpc925_probe, - .remove = cpc925_remove, + .remove_new = cpc925_remove, .driver = { .name = "cpc925_edac", } From d27cb32e00eff60d8fa94de265a8d84c63ef059c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:40 +0200 Subject: [PATCH 017/310] EDAC/dmc520: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-8-u.kleine-koenig@pengutronix.de --- drivers/edac/dmc520_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/dmc520_edac.c b/drivers/edac/dmc520_edac.c index 1fa5ca57e9ec..4e30b989a1a4 100644 --- a/drivers/edac/dmc520_edac.c +++ b/drivers/edac/dmc520_edac.c @@ -602,7 +602,7 @@ err: return ret; } -static int dmc520_edac_remove(struct platform_device *pdev) +static void dmc520_edac_remove(struct platform_device *pdev) { u32 reg_val, idx, irq_mask_all = 0; struct mem_ctl_info *mci; @@ -626,8 +626,6 @@ static int dmc520_edac_remove(struct platform_device *pdev) edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static const struct of_device_id dmc520_edac_driver_id[] = { @@ -644,7 +642,7 @@ static struct platform_driver dmc520_edac_driver = { }, .probe = dmc520_edac_probe, - .remove = dmc520_edac_remove + .remove_new = dmc520_edac_remove }; module_platform_driver(dmc520_edac_driver); From 7aca2e9b7bc464e90481568359ab975b199d8213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:41 +0200 Subject: [PATCH 018/310] EDAC/highbank_l2: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Andre Przywara Link: https://lore.kernel.org/r/20231004131254.2673842-9-u.kleine-koenig@pengutronix.de --- drivers/edac/highbank_l2_edac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/highbank_l2_edac.c b/drivers/edac/highbank_l2_edac.c index 140d4431bd0d..5646c049a934 100644 --- a/drivers/edac/highbank_l2_edac.c +++ b/drivers/edac/highbank_l2_edac.c @@ -118,18 +118,17 @@ err: return res; } -static int highbank_l2_err_remove(struct platform_device *pdev) +static void highbank_l2_err_remove(struct platform_device *pdev) { struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(dci); - return 0; } static struct platform_driver highbank_l2_edac_driver = { .probe = highbank_l2_err_probe, - .remove = highbank_l2_err_remove, + .remove_new = highbank_l2_err_remove, .driver = { .name = "hb_l2_edac", .of_match_table = hb_l2_err_of_match, From 81b3e87411ebf3cc27fc15655ef1555b8c853acf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:42 +0200 Subject: [PATCH 019/310] EDAC/highbank_mc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Andre Przywara Link: https://lore.kernel.org/r/20231004131254.2673842-10-u.kleine-koenig@pengutronix.de --- drivers/edac/highbank_mc_edac.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/highbank_mc_edac.c b/drivers/edac/highbank_mc_edac.c index a0c04a7f95e9..1c5b888ab11d 100644 --- a/drivers/edac/highbank_mc_edac.c +++ b/drivers/edac/highbank_mc_edac.c @@ -251,18 +251,17 @@ free: return res; } -static int highbank_mc_remove(struct platform_device *pdev) +static void highbank_mc_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - return 0; } static struct platform_driver highbank_mc_edac_driver = { .probe = highbank_mc_probe, - .remove = highbank_mc_remove, + .remove_new = highbank_mc_remove, .driver = { .name = "hb_mc_edac", .of_match_table = hb_ddr_ctrl_of_match, From 1baf49724e8dac791351509828b377b5c002322e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:43 +0200 Subject: [PATCH 020/310] EDAC/mpc85xx: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-11-u.kleine-koenig@pengutronix.de --- drivers/edac/mpc85xx_edac.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 2b5703e5066e..e0d702cef66f 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -300,7 +300,7 @@ err: return res; } -static int mpc85xx_pci_err_remove(struct platform_device *op) +static void mpc85xx_pci_err_remove(struct platform_device *op) { struct edac_pci_ctl_info *pci = dev_get_drvdata(&op->dev); struct mpc85xx_pci_pdata *pdata = pci->pvt_info; @@ -312,8 +312,6 @@ static int mpc85xx_pci_err_remove(struct platform_device *op) edac_pci_del_device(&op->dev); edac_pci_free_ctl_info(pci); - - return 0; } static const struct platform_device_id mpc85xx_pci_err_match[] = { @@ -325,7 +323,7 @@ static const struct platform_device_id mpc85xx_pci_err_match[] = { static struct platform_driver mpc85xx_pci_err_driver = { .probe = mpc85xx_pci_err_probe, - .remove = mpc85xx_pci_err_remove, + .remove_new = mpc85xx_pci_err_remove, .id_table = mpc85xx_pci_err_match, .driver = { .name = "mpc85xx_pci_err", @@ -591,7 +589,7 @@ err: return res; } -static int mpc85xx_l2_err_remove(struct platform_device *op) +static void mpc85xx_l2_err_remove(struct platform_device *op) { struct edac_device_ctl_info *edac_dev = dev_get_drvdata(&op->dev); struct mpc85xx_l2_pdata *pdata = edac_dev->pvt_info; @@ -606,7 +604,6 @@ static int mpc85xx_l2_err_remove(struct platform_device *op) out_be32(pdata->l2_vbase + MPC85XX_L2_ERRDIS, orig_l2_err_disable); edac_device_del_device(&op->dev); edac_device_free_ctl_info(edac_dev); - return 0; } static const struct of_device_id mpc85xx_l2_err_of_match[] = { @@ -630,7 +627,7 @@ MODULE_DEVICE_TABLE(of, mpc85xx_l2_err_of_match); static struct platform_driver mpc85xx_l2_err_driver = { .probe = mpc85xx_l2_err_probe, - .remove = mpc85xx_l2_err_remove, + .remove_new = mpc85xx_l2_err_remove, .driver = { .name = "mpc85xx_l2_err", .of_match_table = mpc85xx_l2_err_of_match, From 8510e004d5d50fa193e96d22de84c0d625193a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:44 +0200 Subject: [PATCH 021/310] EDAC/npcm: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-12-u.kleine-koenig@pengutronix.de --- drivers/edac/npcm_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/npcm_edac.c b/drivers/edac/npcm_edac.c index 6d15c1550263..2e2133b784e9 100644 --- a/drivers/edac/npcm_edac.c +++ b/drivers/edac/npcm_edac.c @@ -410,7 +410,7 @@ free_edac_mc: return rc; } -static int edac_remove(struct platform_device *pdev) +static void edac_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); struct priv_data *priv = mci->pvt_info; @@ -426,8 +426,6 @@ static int edac_remove(struct platform_device *pdev) regmap_write(npcm_regmap, pdata->ctl_int_mask_master, pdata->int_mask_master_global_mask); regmap_update_bits(npcm_regmap, pdata->ctl_ecc_en, pdata->ecc_en_mask, 0); - - return 0; } static const struct npcm_platform_data npcm750_edac = { @@ -533,7 +531,7 @@ static struct platform_driver npcm_edac_driver = { .of_match_table = npcm_edac_of_match, }, .probe = edac_probe, - .remove = edac_remove, + .remove_new = edac_remove, }; module_platform_driver(npcm_edac_driver); From 01314f277299f45c5758166d2ad3c195719187a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:45 +0200 Subject: [PATCH 022/310] EDAC/octeon-l2c: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231004131254.2673842-13-u.kleine-koenig@pengutronix.de --- drivers/edac/octeon_edac-l2c.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/octeon_edac-l2c.c b/drivers/edac/octeon_edac-l2c.c index c33059e9b0be..4015eb9af6fe 100644 --- a/drivers/edac/octeon_edac-l2c.c +++ b/drivers/edac/octeon_edac-l2c.c @@ -184,19 +184,17 @@ err: return -ENXIO; } -static int octeon_l2c_remove(struct platform_device *pdev) +static void octeon_l2c_remove(struct platform_device *pdev) { struct edac_device_ctl_info *l2c = platform_get_drvdata(pdev); edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(l2c); - - return 0; } static struct platform_driver octeon_l2c_driver = { .probe = octeon_l2c_probe, - .remove = octeon_l2c_remove, + .remove_new = octeon_l2c_remove, .driver = { .name = "octeon_l2c_edac", } From c2a962933c761e4c046f4d519530e78757eff682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:46 +0200 Subject: [PATCH 023/310] EDAC/octeon-lmc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231004131254.2673842-14-u.kleine-koenig@pengutronix.de --- drivers/edac/octeon_edac-lmc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/octeon_edac-lmc.c b/drivers/edac/octeon_edac-lmc.c index aeb222ca3ed1..18615cbcd9ea 100644 --- a/drivers/edac/octeon_edac-lmc.c +++ b/drivers/edac/octeon_edac-lmc.c @@ -302,18 +302,17 @@ static int octeon_lmc_edac_probe(struct platform_device *pdev) return 0; } -static int octeon_lmc_edac_remove(struct platform_device *pdev) +static void octeon_lmc_edac_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - return 0; } static struct platform_driver octeon_lmc_edac_driver = { .probe = octeon_lmc_edac_probe, - .remove = octeon_lmc_edac_remove, + .remove_new = octeon_lmc_edac_remove, .driver = { .name = "octeon_lmc_edac", } From a92dd68e163a244f828d0124a5c3dde6d4e1088e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:47 +0200 Subject: [PATCH 024/310] EDAC/octeon-pc: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231004131254.2673842-15-u.kleine-koenig@pengutronix.de --- drivers/edac/octeon_edac-pc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/edac/octeon_edac-pc.c b/drivers/edac/octeon_edac-pc.c index 754eced59c32..ea8a8e337b1e 100644 --- a/drivers/edac/octeon_edac-pc.c +++ b/drivers/edac/octeon_edac-pc.c @@ -119,19 +119,18 @@ err: return -ENXIO; } -static int co_cache_error_remove(struct platform_device *pdev) +static void co_cache_error_remove(struct platform_device *pdev) { struct co_cache_error *p = platform_get_drvdata(pdev); unregister_co_cache_error_notifier(&p->notifier); edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(p->ed); - return 0; } static struct platform_driver co_cache_error_driver = { .probe = co_cache_error_probe, - .remove = co_cache_error_remove, + .remove_new = co_cache_error_remove, .driver = { .name = "octeon_pc_edac", } From 524d3e56fb5e01b80f80b86e8d7d452413a2f9b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:48 +0200 Subject: [PATCH 025/310] EDAC/octeon-pci: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231004131254.2673842-16-u.kleine-koenig@pengutronix.de --- drivers/edac/octeon_edac-pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/octeon_edac-pci.c b/drivers/edac/octeon_edac-pci.c index 28b238eecefc..108ad9493cfb 100644 --- a/drivers/edac/octeon_edac-pci.c +++ b/drivers/edac/octeon_edac-pci.c @@ -87,19 +87,17 @@ err: return res; } -static int octeon_pci_remove(struct platform_device *pdev) +static void octeon_pci_remove(struct platform_device *pdev) { struct edac_pci_ctl_info *pci = platform_get_drvdata(pdev); edac_pci_del_device(&pdev->dev); edac_pci_free_ctl_info(pci); - - return 0; } static struct platform_driver octeon_pci_driver = { .probe = octeon_pci_probe, - .remove = octeon_pci_remove, + .remove_new = octeon_pci_remove, .driver = { .name = "octeon_pci_edac", } From 58758ffa11a79834d69d560448e5fa965cc80248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:49 +0200 Subject: [PATCH 026/310] EDAC/ppc4xx: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-17-u.kleine-koenig@pengutronix.de --- drivers/edac/ppc4xx_edac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index 046969b4e82e..1eea3341a916 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -1329,8 +1329,7 @@ static int ppc4xx_edac_probe(struct platform_device *op) * * Unconditionally returns 0. */ -static int -ppc4xx_edac_remove(struct platform_device *op) +static void ppc4xx_edac_remove(struct platform_device *op) { struct mem_ctl_info *mci = dev_get_drvdata(&op->dev); struct ppc4xx_edac_pdata *pdata = mci->pvt_info; @@ -1344,8 +1343,6 @@ ppc4xx_edac_remove(struct platform_device *op) edac_mc_del_mc(mci->pdev); edac_mc_free(mci); - - return 0; } /** @@ -1379,7 +1376,7 @@ ppc4xx_edac_opstate_init(void) static struct platform_driver ppc4xx_edac_driver = { .probe = ppc4xx_edac_probe, - .remove = ppc4xx_edac_remove, + .remove_new = ppc4xx_edac_remove, .driver = { .name = PPC4XX_EDAC_MODULE_NAME, .of_match_table = ppc4xx_edac_match, From bfee05aa3806e5d37090e450989efbe0109ebd70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:50 +0200 Subject: [PATCH 027/310] EDAC/qcom: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-18-u.kleine-koenig@pengutronix.de --- drivers/edac/qcom_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c index b2db545c6810..5539917c01dd 100644 --- a/drivers/edac/qcom_edac.c +++ b/drivers/edac/qcom_edac.c @@ -390,14 +390,12 @@ irq_done: return rc; } -static int qcom_llcc_edac_remove(struct platform_device *pdev) +static void qcom_llcc_edac_remove(struct platform_device *pdev) { struct edac_device_ctl_info *edev_ctl = dev_get_drvdata(&pdev->dev); edac_device_del_device(edev_ctl->dev); edac_device_free_ctl_info(edev_ctl); - - return 0; } static const struct platform_device_id qcom_llcc_edac_id_table[] = { @@ -408,7 +406,7 @@ MODULE_DEVICE_TABLE(platform, qcom_llcc_edac_id_table); static struct platform_driver qcom_llcc_edac_driver = { .probe = qcom_llcc_edac_probe, - .remove = qcom_llcc_edac_remove, + .remove_new = qcom_llcc_edac_remove, .driver = { .name = "qcom_llcc_edac", }, From f30e2fac7da30f7f8bb683f3a3ad7db5dc4cffc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:51 +0200 Subject: [PATCH 028/310] EDAC/synopsys: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-19-u.kleine-koenig@pengutronix.de --- drivers/edac/synopsys_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index c4fc64cbecd0..709babce43ba 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -1410,7 +1410,7 @@ free_edac_mc: * * Return: Unconditionally 0 */ -static int mc_remove(struct platform_device *pdev) +static void mc_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); struct synps_edac_priv *priv = mci->pvt_info; @@ -1425,8 +1425,6 @@ static int mc_remove(struct platform_device *pdev) edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static struct platform_driver synps_edac_mc_driver = { @@ -1435,7 +1433,7 @@ static struct platform_driver synps_edac_mc_driver = { .of_match_table = synps_edac_match, }, .probe = mc_probe, - .remove = mc_remove, + .remove_new = mc_remove, }; module_platform_driver(synps_edac_mc_driver); From 8312b2bbddb6aceebc6815f34b9a3f97722901e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:52 +0200 Subject: [PATCH 029/310] EDAC/ti: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-20-u.kleine-koenig@pengutronix.de --- drivers/edac/ti_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c index 6971ded598de..29723c9592f7 100644 --- a/drivers/edac/ti_edac.c +++ b/drivers/edac/ti_edac.c @@ -312,19 +312,17 @@ err: return ret; } -static int ti_edac_remove(struct platform_device *pdev) +static void ti_edac_remove(struct platform_device *pdev) { struct mem_ctl_info *mci = platform_get_drvdata(pdev); edac_mc_del_mc(&pdev->dev); edac_mc_free(mci); - - return 0; } static struct platform_driver ti_edac_driver = { .probe = ti_edac_probe, - .remove = ti_edac_remove, + .remove_new = ti_edac_remove, .driver = { .name = EDAC_MOD_NAME, .of_match_table = ti_edac_of_match, From 9441e5ca3ad3a4f7ae81fda5a1247d03098caf2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:53 +0200 Subject: [PATCH 030/310] EDAC/xgene: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231004131254.2673842-21-u.kleine-koenig@pengutronix.de --- drivers/edac/xgene_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index c52b9dd9154c..1b50f8160013 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -1960,7 +1960,7 @@ out_err: return rc; } -static int xgene_edac_remove(struct platform_device *pdev) +static void xgene_edac_remove(struct platform_device *pdev) { struct xgene_edac *edac = dev_get_drvdata(&pdev->dev); struct xgene_edac_mc_ctx *mcu; @@ -1981,8 +1981,6 @@ static int xgene_edac_remove(struct platform_device *pdev) list_for_each_entry_safe(node, temp_node, &edac->socs, next) xgene_edac_soc_remove(node); - - return 0; } static const struct of_device_id xgene_edac_of_match[] = { @@ -1993,7 +1991,7 @@ MODULE_DEVICE_TABLE(of, xgene_edac_of_match); static struct platform_driver xgene_edac_driver = { .probe = xgene_edac_probe, - .remove = xgene_edac_remove, + .remove_new = xgene_edac_remove, .driver = { .name = "xgene-edac", .of_match_table = xgene_edac_of_match, From ec886cf8813bbfd4dd3f7aba3f660edd5f0a69d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 4 Oct 2023 15:12:54 +0200 Subject: [PATCH 031/310] EDAC/zynqmp: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Acked-by: Michal Simek Link: https://lore.kernel.org/r/20231004131254.2673842-22-u.kleine-koenig@pengutronix.de --- drivers/edac/zynqmp_edac.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/edac/zynqmp_edac.c b/drivers/edac/zynqmp_edac.c index ac7d1e0b324c..2d9a5cfd8931 100644 --- a/drivers/edac/zynqmp_edac.c +++ b/drivers/edac/zynqmp_edac.c @@ -426,7 +426,7 @@ free_dev_ctl: return ret; } -static int edac_remove(struct platform_device *pdev) +static void edac_remove(struct platform_device *pdev) { struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); struct edac_priv *priv = dci->pvt_info; @@ -440,8 +440,6 @@ static int edac_remove(struct platform_device *pdev) edac_device_del_device(&pdev->dev); edac_device_free_ctl_info(dci); - - return 0; } static const struct of_device_id zynqmp_ocm_edac_match[] = { @@ -457,7 +455,7 @@ static struct platform_driver zynqmp_ocm_edac_driver = { .of_match_table = zynqmp_ocm_edac_match, }, .probe = edac_probe, - .remove = edac_remove, + .remove_new = edac_remove, }; module_platform_driver(zynqmp_ocm_edac_driver); From 0c7c7ba0c7215de44410e9c4acce28d762dd907a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 13 Oct 2023 12:04:23 +0200 Subject: [PATCH 032/310] EDAC/fsl_ddr: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). fsl_mc_err_remove() is used as callback in two drivers. So these have to be converted together to the void returning remove callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231013100422.1382040-2-u.kleine-koenig@pengutronix.de --- drivers/edac/fsl_ddr_edac.c | 3 +-- drivers/edac/fsl_ddr_edac.h | 2 +- drivers/edac/layerscape_edac.c | 2 +- drivers/edac/mpc85xx_edac.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/edac/fsl_ddr_edac.c b/drivers/edac/fsl_ddr_edac.c index b81757555a8a..d148d262d0d4 100644 --- a/drivers/edac/fsl_ddr_edac.c +++ b/drivers/edac/fsl_ddr_edac.c @@ -612,7 +612,7 @@ err: return res; } -int fsl_mc_err_remove(struct platform_device *op) +void fsl_mc_err_remove(struct platform_device *op) { struct mem_ctl_info *mci = dev_get_drvdata(&op->dev); struct fsl_mc_pdata *pdata = mci->pvt_info; @@ -629,5 +629,4 @@ int fsl_mc_err_remove(struct platform_device *op) edac_mc_del_mc(&op->dev); edac_mc_free(mci); - return 0; } diff --git a/drivers/edac/fsl_ddr_edac.h b/drivers/edac/fsl_ddr_edac.h index 332439d7b2d9..c0994a2a003c 100644 --- a/drivers/edac/fsl_ddr_edac.h +++ b/drivers/edac/fsl_ddr_edac.h @@ -72,5 +72,5 @@ struct fsl_mc_pdata { int irq; }; int fsl_mc_err_probe(struct platform_device *op); -int fsl_mc_err_remove(struct platform_device *op); +void fsl_mc_err_remove(struct platform_device *op); #endif diff --git a/drivers/edac/layerscape_edac.c b/drivers/edac/layerscape_edac.c index 7c5e2b3c0daa..d2f895033280 100644 --- a/drivers/edac/layerscape_edac.c +++ b/drivers/edac/layerscape_edac.c @@ -27,7 +27,7 @@ MODULE_DEVICE_TABLE(of, fsl_ddr_mc_err_of_match); static struct platform_driver fsl_ddr_mc_err_driver = { .probe = fsl_mc_err_probe, - .remove = fsl_mc_err_remove, + .remove_new = fsl_mc_err_remove, .driver = { .name = "fsl_ddr_mc_err", .of_match_table = fsl_ddr_mc_err_of_match, diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index e0d702cef66f..c1bc53f4e184 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -656,7 +656,7 @@ MODULE_DEVICE_TABLE(of, mpc85xx_mc_err_of_match); static struct platform_driver mpc85xx_mc_err_driver = { .probe = fsl_mc_err_probe, - .remove = fsl_mc_err_remove, + .remove_new = fsl_mc_err_remove, .driver = { .name = "mpc85xx_mc_err", .of_match_table = mpc85xx_mc_err_of_match, From 04c40eed3f7ac48ddaf20104489510e743a53c47 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:38 +0100 Subject: [PATCH 033/310] powerpc/ps3: move udbg_shutdown_ps3gelic prototype Allmodconfig kernels produce a missing-prototypes warning: arch/powerpc/platforms/ps3/gelic_udbg.c:239:6: error: no previous prototype for 'udbg_shutdown_ps3gelic' [-Werror=missing-prototypes] Move the declaration from a local header to asm/ps3.h where it can be seen from both the caller and the definition. Signed-off-by: Arnd Bergmann Signed-off-by: Geoff Levand Acked-by: Jakub Kicinski [mpe: Drop CONFIG_PS3GELIC_UDBG to fix build error] Signed-off-by: Michael Ellerman Link: https://msgid.link/20231108125843.3806765-18-arnd@kernel.org --- arch/powerpc/Kconfig.debug | 1 - arch/powerpc/include/asm/ps3.h | 6 ++++++ arch/powerpc/platforms/ps3/Kconfig | 12 ------------ arch/powerpc/platforms/ps3/Makefile | 2 +- arch/powerpc/platforms/ps3/gelic_udbg.c | 1 + drivers/net/ethernet/toshiba/ps3_gelic_net.h | 6 ------ 6 files changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index ea4033abc07d..8c80b154e814 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -271,7 +271,6 @@ config PPC_EARLY_DEBUG_USBGECKO config PPC_EARLY_DEBUG_PS3GELIC bool "Early debugging through the PS3 Ethernet port" depends on PPC_PS3 - select PS3GELIC_UDBG help Select this to enable early debugging for the PlayStation3 via UDP broadcasts sent out through the Ethernet port. diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h index a5f36546a052..d13d8fdc3411 100644 --- a/arch/powerpc/include/asm/ps3.h +++ b/arch/powerpc/include/asm/ps3.h @@ -514,4 +514,10 @@ u64 ps3_get_spe_id(void *arg); void ps3_early_mm_init(void); +#ifdef CONFIG_PPC_EARLY_DEBUG_PS3GELIC +void udbg_shutdown_ps3gelic(void); +#else +static inline void udbg_shutdown_ps3gelic(void) {} +#endif + #endif diff --git a/arch/powerpc/platforms/ps3/Kconfig b/arch/powerpc/platforms/ps3/Kconfig index a44869e5ea70..e9c1087dd42e 100644 --- a/arch/powerpc/platforms/ps3/Kconfig +++ b/arch/powerpc/platforms/ps3/Kconfig @@ -167,16 +167,4 @@ config PS3_LPM profiling support of the Cell processor with programs like perfmon2, then say Y or M, otherwise say N. -config PS3GELIC_UDBG - bool "PS3 udbg output via UDP broadcasts on Ethernet" - depends on PPC_PS3 - help - Enables udbg early debugging output by sending broadcast UDP - via the Ethernet port (UDP port number 18194). - - This driver uses a trivial implementation and is independent - from the main PS3 gelic network driver. - - If in doubt, say N here. - endmenu diff --git a/arch/powerpc/platforms/ps3/Makefile b/arch/powerpc/platforms/ps3/Makefile index 86bf2967a8d4..bc79bb124d1e 100644 --- a/arch/powerpc/platforms/ps3/Makefile +++ b/arch/powerpc/platforms/ps3/Makefile @@ -3,7 +3,7 @@ obj-y += setup.o mm.o time.o hvcall.o htab.o repository.o obj-y += interrupt.o exports.o os-area.o obj-y += system-bus.o -obj-$(CONFIG_PS3GELIC_UDBG) += gelic_udbg.o +obj-$(CONFIG_PPC_EARLY_DEBUG_PS3GELIC) += gelic_udbg.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SPU_BASE) += spu.o obj-y += device-init.o diff --git a/arch/powerpc/platforms/ps3/gelic_udbg.c b/arch/powerpc/platforms/ps3/gelic_udbg.c index 6b298010fd84..a5202c18c236 100644 --- a/arch/powerpc/platforms/ps3/gelic_udbg.c +++ b/arch/powerpc/platforms/ps3/gelic_udbg.c @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h index 0d98defb011e..0ec7412febc7 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h @@ -346,12 +346,6 @@ static inline void *port_priv(struct gelic_port *port) return port->priv; } -#ifdef CONFIG_PPC_EARLY_DEBUG_PS3GELIC -void udbg_shutdown_ps3gelic(void); -#else -static inline void udbg_shutdown_ps3gelic(void) {} -#endif - int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask); /* shared netdev ops */ void gelic_card_up(struct gelic_card *card); From 0c9a768de64d24e38e27652b8c273725ccc31916 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:39 +0100 Subject: [PATCH 034/310] powerpc/pasemi: mark pas_shutdown() static Allmodconfig builds show a warning about one function that is accidentally marked global: arch/powerpc/platforms/pasemi/setup.c:67:6: error: no previous prototype for 'pas_shutdown' [-Werror=missing-prototypes] Fixes: 656fdf3ad8e0 ("powerpc/pasemi: Add Nemo board device init code.") Signed-off-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231108125843.3806765-19-arnd@kernel.org --- arch/powerpc/platforms/pasemi/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c index ef985ba2bf21..0761d98e5be3 100644 --- a/arch/powerpc/platforms/pasemi/setup.c +++ b/arch/powerpc/platforms/pasemi/setup.c @@ -64,7 +64,7 @@ static void __noreturn pas_restart(char *cmd) } #ifdef CONFIG_PPC_PASEMI_NEMO -void pas_shutdown(void) +static void pas_shutdown(void) { /* Set the PLD bit that makes the SB600 think the power button is being pressed */ void __iomem *pld_map = ioremap(0xf5000000,4096); From afb36ac386783d2ef2ed839293c03fd06f470be0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:40 +0100 Subject: [PATCH 035/310] powerpc/powermac: mark smp_psurge_{give,take}_timebase static These functions are only called locally and should be static like the other corresponding functions are: arch/powerpc/platforms/powermac/smp.c:416:13: error: no previous prototype for 'smp_psurge_take_timebase' [-Werror=missing-prototypes] 416 | void __init smp_psurge_take_timebase(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ arch/powerpc/platforms/powermac/smp.c:432:13: error: no previous prototype for 'smp_psurge_give_timebase' [-Werror=missing-prototypes] 432 | void __init smp_psurge_give_timebase(void) | ^~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231108125843.3806765-20-arnd@kernel.org --- arch/powerpc/platforms/powermac/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index c83d1e14077e..15644be31990 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -413,7 +413,7 @@ static void __init smp_psurge_setup_cpu(int cpu_nr) printk(KERN_ERR "Couldn't get primary IPI interrupt"); } -void __init smp_psurge_take_timebase(void) +static void __init smp_psurge_take_timebase(void) { if (psurge_type != PSURGE_DUAL) return; @@ -429,7 +429,7 @@ void __init smp_psurge_take_timebase(void) set_dec(tb_ticks_per_jiffy/2); } -void __init smp_psurge_give_timebase(void) +static void __init smp_psurge_give_timebase(void) { /* Nothing to do here */ } From 981d1c997fbc5e193b282f3a325a0230bf697363 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 6 Nov 2023 07:42:55 -0600 Subject: [PATCH 036/310] powerpc/rtas: Drop declaration of undefined call_rtas() function The call_rtas() function has never been a part of arch/powerpc, and its implementation was removed from arch/ppc by 0a26b1364f14 ("ppc: Remove CHRP, POWER3 and POWER4 support from arch/ppc"). Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231106-rtas-trivial-v1-3-61847655c51f@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index c697c3c74694..3bf7f0a4b07e 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -542,8 +542,6 @@ static inline void pSeries_coalesce_init(void) { } static inline void rtas_initialize(void) { } #endif -extern int call_rtas(const char *, int, int, unsigned long *, ...); - #ifdef CONFIG_HV_PERF_CTRS void read_24x7_sys_info(void); #else From 1d8faf1f41b550eb7ab7ac841ebd70f205840dde Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 6 Nov 2023 07:42:56 -0600 Subject: [PATCH 037/310] powerpc/rtas: Remove unused rtas_service_present() rtas_service_present() has no more users. rtas_function_implemented() is now the appropriate API for determining whether a given RTAS function is available to call. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231106-rtas-trivial-v1-4-61847655c51f@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 1 - arch/powerpc/kernel/rtas.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 3bf7f0a4b07e..c6568a647cd0 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -409,7 +409,6 @@ static inline bool rtas_function_implemented(const rtas_fn_handle_t handle) return rtas_function_token(handle) != RTAS_UNKNOWN_SERVICE; } extern int rtas_token(const char *service); -extern int rtas_service_present(const char *service); extern int rtas_call(int token, int, int, int *, ...); void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index eddc031c4b95..b5b340a91157 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -900,11 +900,6 @@ int rtas_token(const char *service) } EXPORT_SYMBOL_GPL(rtas_token); -int rtas_service_present(const char *service) -{ - return rtas_token(service) != RTAS_UNKNOWN_SERVICE; -} - #ifdef CONFIG_RTAS_ERROR_LOGGING static u32 rtas_error_log_max __ro_after_init = RTAS_ERROR_LOG_MAX; From 010862d235c9fab4f0f9dd169efc72df94110758 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 6 Nov 2023 07:42:57 -0600 Subject: [PATCH 038/310] powerpc/rtas: Move post_mobility_fixup() declaration to pseries This is a pseries-specific function declaration that doesn't belong in rtas.h. Move it to the pseries platform code and adjust pseries/suspend.c accordingly. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231106-rtas-trivial-v1-5-61847655c51f@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 1 - arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/suspend.c | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index c6568a647cd0..2365668fc13e 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -444,7 +444,6 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES extern time64_t last_rtas_event; extern int clobbering_unread_rtas_event(void); -extern void post_mobility_fixup(void); int rtas_syscall_dispatch_ibm_suspend_me(u64 handle); #else static inline int clobbering_unread_rtas_event(void) { return 0; } diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 8376f03f932a..bba4ad192b0f 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -55,6 +55,7 @@ extern int dlpar_detach_node(struct device_node *); extern int dlpar_acquire_drc(u32 drc_index); extern int dlpar_release_drc(u32 drc_index); extern int dlpar_unisolate_drc(u32 drc_index); +extern void post_mobility_fixup(void); void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog); int handle_dlpar_errorlog(struct pseries_hp_errorlog *hp_errlog); diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index 5c43435472cc..382003dfdb9a 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -13,6 +13,7 @@ #include #include #include +#include "pseries.h" static struct device suspend_dev; From 19773eda86e289526b7f08fa56c92e75cd7796f6 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 6 Nov 2023 07:42:58 -0600 Subject: [PATCH 039/310] powerpc/rtas: Remove trailing space Use scripts/cleanfile to remove instances of trailing space in the core RTAS code and header. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231106-rtas-trivial-v1-6-61847655c51f@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 6 +++--- arch/powerpc/kernel/rtas.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 2365668fc13e..1bed6be8ada3 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -268,7 +268,7 @@ typedef struct { #define RTAS_TYPE_DEALLOC 0xE3 #define RTAS_TYPE_DUMP 0xE4 #define RTAS_TYPE_HOTPLUG 0xE5 -/* I don't add PowerMGM events right now, this is a different topic */ +/* I don't add PowerMGM events right now, this is a different topic */ #define RTAS_TYPE_PMGM_POWER_SW_ON 0x60 #define RTAS_TYPE_PMGM_POWER_SW_OFF 0x61 #define RTAS_TYPE_PMGM_LID_OPEN 0x62 @@ -461,7 +461,7 @@ static inline void rtas_cancel_event_scan(void) { } /* Error types logged. */ #define ERR_FLAG_ALREADY_LOGGED 0x0 -#define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ +#define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ #define ERR_TYPE_RTAS_LOG 0x2 /* from rtas event-scan */ #define ERR_TYPE_KERNEL_PANIC 0x4 /* from die()/panic() */ #define ERR_TYPE_KERNEL_PANIC_GZ 0x8 /* ditto, compressed */ @@ -471,7 +471,7 @@ static inline void rtas_cancel_event_scan(void) { } (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC | ERR_TYPE_KERNEL_PANIC_GZ) #define RTAS_DEBUG KERN_DEBUG "RTAS: " - + #define RTAS_ERROR_LOG_MAX 2048 /* diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index b5b340a91157..c49f078382a9 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -670,7 +670,7 @@ static void call_rtas_display_status_delay(char c) static int pending_newline = 0; /* did last write end with unprinted newline? */ static int width = 16; - if (c == '\n') { + if (c == '\n') { while (width-- > 0) call_rtas_display_status(' '); width = 16; @@ -680,7 +680,7 @@ static void call_rtas_display_status_delay(char c) if (pending_newline) { call_rtas_display_status('\r'); call_rtas_display_status('\n'); - } + } pending_newline = 0; if (width--) { call_rtas_display_status(c); @@ -820,7 +820,7 @@ void rtas_progress(char *s, unsigned short hex) else rtas_call(display_character, 1, 1, NULL, '\r'); } - + if (row_width) width = row_width[current_line]; else @@ -840,9 +840,9 @@ void rtas_progress(char *s, unsigned short hex) spin_unlock(&progress_lock); return; } - + /* RTAS wants CR-LF, not just LF */ - + if (*os == '\n') { rtas_call(display_character, 1, 1, NULL, '\r'); rtas_call(display_character, 1, 1, NULL, '\n'); @@ -852,7 +852,7 @@ void rtas_progress(char *s, unsigned short hex) */ rtas_call(display_character, 1, 1, NULL, *os); } - + if (row_width) width = row_width[current_line]; else @@ -861,15 +861,15 @@ void rtas_progress(char *s, unsigned short hex) width--; rtas_call(display_character, 1, 1, NULL, *os); } - + os++; - + /* if we overwrite the screen length */ if (width <= 0) while ((*os != 0) && (*os != '\n') && (*os != '\r')) os++; } - + spin_unlock(&progress_lock); } EXPORT_SYMBOL_GPL(rtas_progress); /* needed by rtas_flash module */ From c516213726fb572700cce4a5909aa8d82b77192a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Nov 2023 15:33:45 +0100 Subject: [PATCH 040/310] x86/entry: Optimize common_interrupt_return() The code in common_interrupt_return() does a bunch of unconditional work that is really only needed on PTI kernels. Specifically it unconditionally copies the IRET frame back onto the entry stack, swizzles onto the entry stack and does IRET from there. However, without PTI we can simply IRET from whatever stack we're on. ivb-ep, mitigations=off, gettid-1m: PRE: 140,118,538 cycles:k ( +- 0.01% ) 236,692,878 instructions:k # 1.69 insn per cycle ( +- 0.00% ) POST: 140,026,608 cycles:k ( +- 0.01% ) 236,696,176 instructions:k # 1.69 insn per cycle ( +- 0.00% ) (this is with --repeat 100 and the run-to-run variance is bigger than the difference shown) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231120143626.638107480@infradead.org --- arch/x86/entry/calling.h | 12 +++++++++--- arch/x86/entry/entry_64.S | 17 +++++++++++++++-- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f6907627172b..9f1d94790a54 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -175,8 +175,7 @@ For 32-bit we have the following conventions - kernel is built with #define THIS_CPU_user_pcid_flush_mask \ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask -.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req - ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI +.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req mov %cr3, \scratch_reg ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID @@ -206,13 +205,20 @@ For 32-bit we have the following conventions - kernel is built with /* Flip the PGD to the user version */ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg mov \scratch_reg, %cr3 +.endm + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2 .Lend_\@: .endm .macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI pushq %rax - SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax popq %rax +.Lend_\@: .endm .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index de6469dffe3a..dfbf799b7311 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -569,7 +569,18 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI +#endif + STACKLEAK_ERASE + POP_REGS + add $8, %rsp /* orig_ax */ + swapgs + jmp .Lnative_iret + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +.Lpti_restore_regs_and_return_to_usermode: POP_REGS pop_rdi=0 /* @@ -596,13 +607,15 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) */ STACKLEAK_ERASE_NOCLOBBER - SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + push %rax + SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax + pop %rax /* Restore RDI. */ popq %rdi swapgs jmp .Lnative_iret - +#endif SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) #ifdef CONFIG_DEBUG_ENTRY From 1e4d3001f59fb7a9917cb746544b65e616b5f809 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Nov 2023 15:33:46 +0100 Subject: [PATCH 041/310] x86/entry: Harden return-to-user Make the CONFIG_DEBUG_ENTRY=y check that validates CS is a user segment unconditional and move it nearer to IRET. PRE: 140,026,608 cycles:k ( +- 0.01% ) 236,696,176 instructions:k # 1.69 insn per cycle ( +- 0.00% ) POST: 139,957,681 cycles:k ( +- 0.01% ) 236,681,819 instructions:k # 1.69 insn per cycle ( +- 0.00% ) (this is with --repeat 100 and the run-to-run variance is bigger than the difference shown) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231120143626.753200755@infradead.org --- arch/x86/entry/entry_64.S | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index dfbf799b7311..c40f89ab1b4c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -559,13 +559,6 @@ __irqentry_text_end: SYM_CODE_START_LOCAL(common_interrupt_return) SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) IBRS_EXIT -#ifdef CONFIG_DEBUG_ENTRY - /* Assert that pt_regs indicates user mode. */ - testb $3, CS(%rsp) - jnz 1f - ud2 -1: -#endif #ifdef CONFIG_XEN_PV ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV #endif @@ -576,8 +569,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) STACKLEAK_ERASE POP_REGS add $8, %rsp /* orig_ax */ + UNWIND_HINT_IRET_REGS + +.Lswapgs_and_iret: swapgs - jmp .Lnative_iret + /* Assert that the IRET frame indicates user mode. */ + testb $3, 8(%rsp) + jnz .Lnative_iret + ud2 #ifdef CONFIG_PAGE_TABLE_ISOLATION .Lpti_restore_regs_and_return_to_usermode: @@ -613,8 +612,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) /* Restore RDI. */ popq %rdi - swapgs - jmp .Lnative_iret + jmp .Lswapgs_and_iret #endif SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL) From 646477fc47905157a8440cdc45aad22901b5b3ce Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 6 Nov 2023 07:42:59 -0600 Subject: [PATCH 042/310] powerpc/rtas: Remove 'extern' from function declarations in rtas.h This header occasionally gains new function declarations without the leading extern in accordance with current style rules. Leaving the legacy externs in place is making the header more difficult to read over time because of the inconsistency. Remove them. Signed-off-by: Nathan Lynch [mpe: Add names to rtas_call() parameters] Signed-off-by: Michael Ellerman Link: https://msgid.link/20231106-rtas-trivial-v1-7-61847655c51f@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 53 ++++++++++++++++----------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 1bed6be8ada3..a7110ed52e25 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -408,42 +408,41 @@ static inline bool rtas_function_implemented(const rtas_fn_handle_t handle) { return rtas_function_token(handle) != RTAS_UNKNOWN_SERVICE; } -extern int rtas_token(const char *service); -extern int rtas_call(int token, int, int, int *, ...); +int rtas_token(const char *service); +int rtas_call(int token, int nargs, int nret, int *outputs, ...); void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...); -extern void __noreturn rtas_restart(char *cmd); -extern void rtas_power_off(void); -extern void __noreturn rtas_halt(void); -extern void rtas_os_term(char *str); +void __noreturn rtas_restart(char *cmd); +void rtas_power_off(void); +void __noreturn rtas_halt(void); +void rtas_os_term(char *str); void rtas_activate_firmware(void); -extern int rtas_get_sensor(int sensor, int index, int *state); -extern int rtas_get_sensor_fast(int sensor, int index, int *state); -extern int rtas_get_power_level(int powerdomain, int *level); -extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); -extern bool rtas_indicator_present(int token, int *maxindex); -extern int rtas_set_indicator(int indicator, int index, int new_value); -extern int rtas_set_indicator_fast(int indicator, int index, int new_value); -extern void rtas_progress(char *s, unsigned short hex); +int rtas_get_sensor(int sensor, int index, int *state); +int rtas_get_sensor_fast(int sensor, int index, int *state); +int rtas_get_power_level(int powerdomain, int *level); +int rtas_set_power_level(int powerdomain, int level, int *setlevel); +bool rtas_indicator_present(int token, int *maxindex); +int rtas_set_indicator(int indicator, int index, int new_value); +int rtas_set_indicator_fast(int indicator, int index, int new_value); +void rtas_progress(char *s, unsigned short hex); int rtas_ibm_suspend_me(int *fw_status); int rtas_error_rc(int rtas_rc); struct rtc_time; -extern time64_t rtas_get_boot_time(void); -extern void rtas_get_rtc_time(struct rtc_time *rtc_time); -extern int rtas_set_rtc_time(struct rtc_time *rtc_time); +time64_t rtas_get_boot_time(void); +void rtas_get_rtc_time(struct rtc_time *rtc_time); +int rtas_set_rtc_time(struct rtc_time *rtc_time); -extern unsigned int rtas_busy_delay_time(int status); +unsigned int rtas_busy_delay_time(int status); bool rtas_busy_delay(int status); -extern int early_init_dt_scan_rtas(unsigned long node, - const char *uname, int depth, void *data); +int early_init_dt_scan_rtas(unsigned long node, const char *uname, int depth, void *data); -extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); +void pSeries_log_error(char *buf, unsigned int err_type, int fatal); #ifdef CONFIG_PPC_PSERIES extern time64_t last_rtas_event; -extern int clobbering_unread_rtas_event(void); +int clobbering_unread_rtas_event(void); int rtas_syscall_dispatch_ibm_suspend_me(u64 handle); #else static inline int clobbering_unread_rtas_event(void) { return 0; } @@ -454,7 +453,7 @@ static inline int rtas_syscall_dispatch_ibm_suspend_me(u64 handle) #endif #ifdef CONFIG_PPC_RTAS_DAEMON -extern void rtas_cancel_event_scan(void); +void rtas_cancel_event_scan(void); #else static inline void rtas_cancel_event_scan(void) { } #endif @@ -479,7 +478,7 @@ static inline void rtas_cancel_event_scan(void) { } * for all rtas calls that require an error buffer argument. * This includes 'check-exception' and 'rtas-last-error'. */ -extern int rtas_get_error_log_max(void); +int rtas_get_error_log_max(void); /* Event Scan Parameters */ #define EVENT_SCAN_ALL_EVENTS 0xf0000000 @@ -518,8 +517,8 @@ static inline u32 rtas_config_addr(int busno, int devfn, int reg) (devfn << 8) | (reg & 0xff); } -extern void rtas_give_timebase(void); -extern void rtas_take_timebase(void); +void rtas_give_timebase(void); +void rtas_take_timebase(void); #ifdef CONFIG_PPC_RTAS static inline int page_is_rtas_user_buf(unsigned long pfn) @@ -532,7 +531,7 @@ static inline int page_is_rtas_user_buf(unsigned long pfn) /* Not the best place to put pSeries_coalesce_init, will be fixed when we * move some of the rtas suspend-me stuff to pseries */ -extern void pSeries_coalesce_init(void); +void pSeries_coalesce_init(void); void rtas_initialize(void); #else static inline int page_is_rtas_user_buf(unsigned long pfn) { return 0;} From 07e8f88568f558fb0f9529f49b3ab120cbe750fe Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Thu, 2 Nov 2023 12:26:19 +0000 Subject: [PATCH 043/310] x86/apic: Drop apic::delivery_mode This field is set to APIC_DELIVERY_MODE_FIXED in all cases, and is read exactly once. Fold the constant in uv_program_mmr() and drop the field. Searching for the origin of the stale HyperV comment reveals commit a31e58e129f7 ("x86/apic: Switch all APICs to Fixed delivery mode") which notes: As a consequence of this change, the apic::irq_delivery_mode field is now pointless, but this needs to be cleaned up in a separate patch. 6 years is long enough for this technical debt to have survived. [ bp: Fold in https://lore.kernel.org/r/20231121123034.1442059-1-andrew.cooper3@citrix.com ] Signed-off-by: Andrew Cooper Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Steve Wahl Link: https://lore.kernel.org/r/20231102-x86-apic-v1-1-bf049a2a0ed6@citrix.com --- arch/x86/include/asm/apic.h | 2 -- arch/x86/kernel/apic/apic_flat_64.c | 2 -- arch/x86/kernel/apic/apic_noop.c | 1 - arch/x86/kernel/apic/apic_numachip.c | 2 -- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/x2apic_cluster.c | 1 - arch/x86/kernel/apic/x2apic_phys.c | 1 - arch/x86/kernel/apic/x2apic_uv_x.c | 1 - arch/x86/platform/uv/uv_irq.c | 2 +- drivers/iommu/amd/iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 2 +- drivers/pci/controller/pci-hyperv.c | 7 ------- 13 files changed, 4 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d21f48f1c242..9d159b771dc8 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -272,8 +272,6 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - enum apic_delivery_modes delivery_mode; - u32 disable_esr : 1, dest_mode_logical : 1, x2apic_set_max_apicid : 1, diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7139867d69cd..b295a056a4fc 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -82,7 +82,6 @@ static struct apic apic_flat __ro_after_init = { .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, @@ -154,7 +153,6 @@ static struct apic apic_physflat __ro_after_init = { .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b00d52ae84fa..9f1d553eb48f 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -47,7 +47,6 @@ static void noop_apic_write(u32 reg, u32 val) struct apic apic_noop __ro_after_init = { .name = "noop", - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 456a14c44f67..7d0c51b9d3bc 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -222,7 +222,6 @@ static const struct apic apic_numachip1 __refconst = { .probe = numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, @@ -259,7 +258,6 @@ static const struct apic apic_numachip2 __refconst = { .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 7ee3c486cb33..5a0d60b38e6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -80,7 +80,6 @@ static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 1, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 5eb3fbe472da..c0f78059f06a 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -45,7 +45,6 @@ static struct apic apic_default __ro_after_init = { .probe = probe_default, .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a8306089c91b..28a7d3f2312d 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -227,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 558a4a8824f4..409815a40668 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -145,7 +145,6 @@ static struct apic apic_x2apic_phys __ro_after_init = { .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1b0d7336a28f..f1766b18dcd0 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -805,7 +805,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 4221259a5870..a379501b7a69 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -35,7 +35,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; - entry->delivery_mode = apic->delivery_mode; + entry->delivery_mode = APIC_DELIVERY_MODE_FIXED; entry->dest_mode = apic->dest_mode_logical; entry->polarity = 0; entry->trigger = 0; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index fcc987f5d4ed..b9a0523cbb0a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3357,7 +3357,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - iommu->irte_ops->prepare(data->entry, apic->delivery_mode, + iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED, apic->dest_mode_logical, irq_cfg->vector, irq_cfg->dest_apicid, devid); @@ -3634,7 +3634,7 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->lo.fields_remap.valid = valid; entry->lo.fields_remap.dm = apic->dest_mode_logical; - entry->lo.fields_remap.int_type = apic->delivery_mode; + entry->lo.fields_remap.int_type = APIC_DELIVERY_MODE_FIXED; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 29b9e55dcf26..566297bc87dd 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1112,7 +1112,7 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) * irq migration in the presence of interrupt-remapping. */ irte->trigger_mode = 0; - irte->dlvry_mode = apic->delivery_mode; + irte->dlvry_mode = APIC_DELIVERY_MODE_FIXED; irte->vector = vector; irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 30c7dfeccb16..1eaffff40b8d 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -650,13 +650,6 @@ static void hv_arch_irq_unmask(struct irq_data *data) PCI_FUNC(pdev->devfn); params->int_target.vector = hv_msi_get_int_vector(data); - /* - * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by - * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a - * spurious interrupt storm. Not doing so does not seem to have a - * negative effect (yet?). - */ - if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) { /* * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the From 855da7cdf974f3902397e5bf9423c7442bdfd75f Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Thu, 2 Nov 2023 12:26:20 +0000 Subject: [PATCH 044/310] x86/apic: Drop enum apic_delivery_modes The type is not used any more. Replace the constants with plain defines so they can live outside of an __ASSEMBLY__ block, allowing for more cleanup in subsequent changes. Signed-off-by: Andrew Cooper Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Steve Wahl Link: https://lore.kernel.org/r/20231102-x86-apic-v1-2-bf049a2a0ed6@citrix.com --- arch/x86/include/asm/apicdef.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 4b125e5b3187..ddcbf00db19d 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -20,6 +20,13 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#define APIC_DELIVERY_MODE_FIXED 0 +#define APIC_DELIVERY_MODE_LOWESTPRIO 1 +#define APIC_DELIVERY_MODE_SMI 2 +#define APIC_DELIVERY_MODE_NMI 4 +#define APIC_DELIVERY_MODE_INIT 5 +#define APIC_DELIVERY_MODE_EXTINT 7 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -430,14 +437,5 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum apic_delivery_modes { - APIC_DELIVERY_MODE_FIXED = 0, - APIC_DELIVERY_MODE_LOWESTPRIO = 1, - APIC_DELIVERY_MODE_SMI = 2, - APIC_DELIVERY_MODE_NMI = 4, - APIC_DELIVERY_MODE_INIT = 5, - APIC_DELIVERY_MODE_EXTINT = 7, -}; - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ From 5a7d6d26af7714718b673e40bdc97f9f207e265a Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Thu, 2 Nov 2023 12:26:21 +0000 Subject: [PATCH 045/310] x86/apic: Drop struct local_apic This type predates recorded history in tglx/history.git, making it older than Feb 5th 2002. This structure is literally old enough to drink in most juristictions in the world, and has not been used once in that time. Lay it to rest in /dev/null. Signed-off-by: Andrew Cooper Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Steve Wahl Link: https://lore.kernel.org/r/20231102-x86-apic-v1-3-bf049a2a0ed6@citrix.com --- arch/x86/include/asm/apicdef.h | 260 --------------------------------- 1 file changed, 260 deletions(-) diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index ddcbf00db19d..094106b6a538 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -172,270 +172,10 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) -#ifndef __ASSEMBLY__ -/* - * the local APIC register structure, memory mapped. Not terribly well - * tested, but we might eventually use this one in the future - the - * problem why we cannot use it right now is the P5 APIC, it has an - * errata which cannot take 8-bit reads and writes, only 32-bit ones ... - */ -#define u32 unsigned int - -struct local_apic { - -/*000*/ struct { u32 __reserved[4]; } __reserved_01; - -/*010*/ struct { u32 __reserved[4]; } __reserved_02; - -/*020*/ struct { /* APIC ID Register */ - u32 __reserved_1 : 24, - phys_apic_id : 4, - __reserved_2 : 4; - u32 __reserved[3]; - } id; - -/*030*/ const - struct { /* APIC Version Register */ - u32 version : 8, - __reserved_1 : 8, - max_lvt : 8, - __reserved_2 : 8; - u32 __reserved[3]; - } version; - -/*040*/ struct { u32 __reserved[4]; } __reserved_03; - -/*050*/ struct { u32 __reserved[4]; } __reserved_04; - -/*060*/ struct { u32 __reserved[4]; } __reserved_05; - -/*070*/ struct { u32 __reserved[4]; } __reserved_06; - -/*080*/ struct { /* Task Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } tpr; - -/*090*/ const - struct { /* Arbitration Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } apr; - -/*0A0*/ const - struct { /* Processor Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } ppr; - -/*0B0*/ struct { /* End Of Interrupt Register */ - u32 eoi; - u32 __reserved[3]; - } eoi; - -/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; - -/*0D0*/ struct { /* Logical Destination Register */ - u32 __reserved_1 : 24, - logical_dest : 8; - u32 __reserved_2[3]; - } ldr; - -/*0E0*/ struct { /* Destination Format Register */ - u32 __reserved_1 : 28, - model : 4; - u32 __reserved_2[3]; - } dfr; - -/*0F0*/ struct { /* Spurious Interrupt Vector Register */ - u32 spurious_vector : 8, - apic_enabled : 1, - focus_cpu : 1, - __reserved_2 : 22; - u32 __reserved_3[3]; - } svr; - -/*100*/ struct { /* In Service Register */ -/*170*/ u32 bitfield; - u32 __reserved[3]; - } isr [8]; - -/*180*/ struct { /* Trigger Mode Register */ -/*1F0*/ u32 bitfield; - u32 __reserved[3]; - } tmr [8]; - -/*200*/ struct { /* Interrupt Request Register */ -/*270*/ u32 bitfield; - u32 __reserved[3]; - } irr [8]; - -/*280*/ union { /* Error Status Register */ - struct { - u32 send_cs_error : 1, - receive_cs_error : 1, - send_accept_error : 1, - receive_accept_error : 1, - __reserved_1 : 1, - send_illegal_vector : 1, - receive_illegal_vector : 1, - illegal_register_address : 1, - __reserved_2 : 24; - u32 __reserved_3[3]; - } error_bits; - struct { - u32 errors; - u32 __reserved_3[3]; - } all_errors; - } esr; - -/*290*/ struct { u32 __reserved[4]; } __reserved_08; - -/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; - -/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; - -/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; - -/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; - -/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; - -/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; - -/*300*/ struct { /* Interrupt Command Register 1 */ - u32 vector : 8, - delivery_mode : 3, - destination_mode : 1, - delivery_status : 1, - __reserved_1 : 1, - level : 1, - trigger : 1, - __reserved_2 : 2, - shorthand : 2, - __reserved_3 : 12; - u32 __reserved_4[3]; - } icr1; - -/*310*/ struct { /* Interrupt Command Register 2 */ - union { - u32 __reserved_1 : 24, - phys_dest : 4, - __reserved_2 : 4; - u32 __reserved_3 : 24, - logical_dest : 8; - } dest; - u32 __reserved_4[3]; - } icr2; - -/*320*/ struct { /* LVT - Timer */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - timer_mode : 1, - __reserved_3 : 14; - u32 __reserved_4[3]; - } lvt_timer; - -/*330*/ struct { /* LVT - Thermal Sensor */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_thermal; - -/*340*/ struct { /* LVT - Performance Counter */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_pc; - -/*350*/ struct { /* LVT - LINT0 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint0; - -/*360*/ struct { /* LVT - LINT1 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint1; - -/*370*/ struct { /* LVT - Error */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_error; - -/*380*/ struct { /* Timer Initial Count Register */ - u32 initial_count; - u32 __reserved_2[3]; - } timer_icr; - -/*390*/ const - struct { /* Timer Current Count Register */ - u32 curr_count; - u32 __reserved_2[3]; - } timer_ccr; - -/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; - -/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; - -/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; - -/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; - -/*3E0*/ struct { /* Timer Divide Configuration Register */ - u32 divisor : 4, - __reserved_1 : 28; - u32 __reserved_2[3]; - } timer_dcr; - -/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; - -} __attribute__ ((packed)); - -#undef u32 - #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else #define BAD_APICID 0xFFFFu #endif -#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_APICDEF_H */ From 6175b407756b22e7fdc771181b7d832ebdedef5c Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Sat, 18 Nov 2023 13:32:29 -0600 Subject: [PATCH 046/310] x86/mce/inject: Clear test status value AMD systems generally allow MCA "simulation" where MCA registers can be written with valid data and the full MCA handling flow can be tested by software. However, the platform on Scalable MCA systems, can prevent software from writing data to the MCA registers. There is no architectural way to determine this configuration. Therefore, the MCE injection module will check for this behavior by writing and reading back a test status value. This is done during module init, and the check can run on any CPU with any valid MCA bank. If MCA_STATUS writes are ignored by the platform, then there are no side effects on the hardware state. If the writes are not ignored, then the test status value will remain in the hardware MCA_STATUS register. It is likely that the value will not be overwritten by hardware or software, since the tested CPU and bank are arbitrary. Therefore, the user may see a spurious, synthetic MCA error reported whenever MCA is polled for this CPU. Clear the test value immediately after writing it. It is very unlikely that a valid MCA error is logged by hardware during the test. Errors that cause an #MC won't be affected. Fixes: 891e465a1bd8 ("x86/mce: Check whether writes to MCA_STATUS are getting ignored") Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231118193248.1296798-2-yazen.ghannam@amd.com --- arch/x86/kernel/cpu/mce/inject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 4d8d4bcf915d..72f0695c3dc1 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -746,6 +746,7 @@ static void check_hw_inj_possible(void) wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; From 03f111710af9ea9cd5a08ecc98e456d1cc0c2284 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Thu, 23 Nov 2023 11:49:11 +0800 Subject: [PATCH 047/310] x86/io: Remove the unused 'bw' parameter from the BUILDIO() macro Commit 1e8f93e18379 ("x86: Consolidate port I/O helpers") moved some port I/O helpers to , which caused the 'bw' parameter in the BUILDIO() macro to become unused. Remove it. Signed-off-by: Yuntao Wang Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231123034911.217791-1-ytcoode@gmail.com --- arch/x86/include/asm/io.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 76238842406a..3814a9263d64 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -242,7 +242,7 @@ static inline void slow_down_io(void) #endif -#define BUILDIO(bwl, bw, type) \ +#define BUILDIO(bwl, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -288,9 +288,9 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, u8) -BUILDIO(w, w, u16) -BUILDIO(l, , u32) +BUILDIO(b, u8) +BUILDIO(w, u16) +BUILDIO(l, u32) #undef BUILDIO #define inb_p inb_p From 5e1c8a47fc6ec6251ddd126f4245279fc072f1c0 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Thu, 23 Nov 2023 12:13:37 +0800 Subject: [PATCH 048/310] x86/ioapic: Remove unfinished sentence from comment [ mingo: Refine changelog. ] Signed-off-by: Adrian Huang Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00da6cf6b07d..40c7cf180c20 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -997,7 +997,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to * the pin list associated with this IRQ and program the IOAPIC - * entry. The IOAPIC entry + * entry. */ if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) From 475c58e1a471e9b873e3e39958c64a2d278275c8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Nov 2023 23:19:53 +0100 Subject: [PATCH 049/310] EDAC/thunderx: Fix possible out-of-bounds string access Enabling -Wstringop-overflow globally exposes a warning for a common bug in the usage of strncat(): drivers/edac/thunderx_edac.c: In function 'thunderx_ocx_com_threaded_isr': drivers/edac/thunderx_edac.c:1136:17: error: 'strncat' specified bound 1024 equals destination size [-Werror=stringop-overflow=] 1136 | strncat(msg, other, OCX_MESSAGE_SIZE); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ... 1145 | strncat(msg, other, OCX_MESSAGE_SIZE); ... 1150 | strncat(msg, other, OCX_MESSAGE_SIZE); ... Apparently the author of this driver expected strncat() to behave the way that strlcat() does, which uses the size of the destination buffer as its third argument rather than the length of the source buffer. The result is that there is no check on the size of the allocated buffer. Change it to strlcat(). [ bp: Trim compiler output, fixup commit message. ] Fixes: 41003396f932 ("EDAC, thunderx: Add Cavium ThunderX EDAC driver") Signed-off-by: Arnd Bergmann Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20231122222007.3199885-1-arnd@kernel.org --- drivers/edac/thunderx_edac.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c index b9c5772da959..90d46e5c4ff0 100644 --- a/drivers/edac/thunderx_edac.c +++ b/drivers/edac/thunderx_edac.c @@ -1133,7 +1133,7 @@ static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id) decode_register(other, OCX_OTHER_SIZE, ocx_com_errors, ctx->reg_com_int); - strncat(msg, other, OCX_MESSAGE_SIZE); + strlcat(msg, other, OCX_MESSAGE_SIZE); for (lane = 0; lane < OCX_RX_LANES; lane++) if (ctx->reg_com_int & BIT(lane)) { @@ -1142,12 +1142,12 @@ static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id) lane, ctx->reg_lane_int[lane], lane, ctx->reg_lane_stat11[lane]); - strncat(msg, other, OCX_MESSAGE_SIZE); + strlcat(msg, other, OCX_MESSAGE_SIZE); decode_register(other, OCX_OTHER_SIZE, ocx_lane_errors, ctx->reg_lane_int[lane]); - strncat(msg, other, OCX_MESSAGE_SIZE); + strlcat(msg, other, OCX_MESSAGE_SIZE); } if (ctx->reg_com_int & OCX_COM_INT_CE) @@ -1217,7 +1217,7 @@ static irqreturn_t thunderx_ocx_lnk_threaded_isr(int irq, void *irq_id) decode_register(other, OCX_OTHER_SIZE, ocx_com_link_errors, ctx->reg_com_link_int); - strncat(msg, other, OCX_MESSAGE_SIZE); + strlcat(msg, other, OCX_MESSAGE_SIZE); if (ctx->reg_com_link_int & OCX_COM_LINK_INT_UE) edac_device_handle_ue(ocx->edac_dev, 0, 0, msg); @@ -1896,7 +1896,7 @@ static irqreturn_t thunderx_l2c_threaded_isr(int irq, void *irq_id) decode_register(other, L2C_OTHER_SIZE, l2_errors, ctx->reg_int); - strncat(msg, other, L2C_MESSAGE_SIZE); + strlcat(msg, other, L2C_MESSAGE_SIZE); if (ctx->reg_int & mask_ue) edac_device_handle_ue(l2c->edac_dev, 0, 0, msg); From 5bfa0e45e9e7212b87fe1564ab45f146c7d56e5f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 24 Nov 2023 09:38:53 +0000 Subject: [PATCH 050/310] x86/cpu/intel_epb: Don't rely on link order intel_epb_init() is called as a subsys_initcall() to register cpuhp callbacks. The callbacks make use of get_cpu_device() which will return NULL unless register_cpu() has been called. register_cpu() is called from topology_init(), which is also a subsys_initcall(). This is fragile. Moving the register_cpu() to a different subsys_initcall() leads to a NULL dereference during boot. Make intel_epb_init() a late_initcall(), user-space can't provide a policy before this point anyway. Signed-off-by: James Morse Signed-off-by: Russell King (Oracle) Signed-off-by: Ingo Molnar Reviewed-by: Gavin Shan Acked-by: Rafael J. Wysocki --- arch/x86/kernel/cpu/intel_epb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index e4c3ba91321c..f18d35fe27a9 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -237,4 +237,4 @@ err_out_online: cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); return ret; } -subsys_initcall(intel_epb_init); +late_initcall(intel_epb_init); From 5f57b717ccce747e95ecbbcdcfd321e5c8d2d4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Nov 2023 11:09:17 +0200 Subject: [PATCH 051/310] EDAC/pci_sysfs: Use PCI_HEADER_TYPE_MASK instead of literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace literal 0x7f with PCI_HEADER_TYPE_MASK. No functional changes: $ sha1sum drivers/edac/edac_pci_sysfs.o.* 805b33a090d8019d8b3b348191f630c72c748c9c drivers/edac/edac_pci_sysfs.o.before 805b33a090d8019d8b3b348191f630c72c748c9c drivers/edac/edac_pci_sysfs.o.after Signed-off-by: Ilpo Järvinen Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231124090919.23687-5-ilpo.jarvinen@linux.intel.com --- drivers/edac/edac_pci_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index 287cc51dbc86..901d4cd3ca38 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -521,7 +521,7 @@ static void edac_pci_dev_parity_clear(struct pci_dev *dev) /* read the device TYPE, looking for bridges */ pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type); - if ((header_type & 0x7F) == PCI_HEADER_TYPE_BRIDGE) + if ((header_type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) get_pci_parity_status(dev, 1); } @@ -583,7 +583,7 @@ static void edac_pci_dev_parity_test(struct pci_dev *dev) edac_dbg(4, "PCI HEADER TYPE= 0x%02x %s\n", header_type, dev_name(&dev->dev)); - if ((header_type & 0x7F) == PCI_HEADER_TYPE_BRIDGE) { + if ((header_type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) { /* On bridges, need to examine secondary status register */ status = get_pci_parity_status(dev, 1); From 357a18d033143617e9c7d420c8f0dd4cbab5f34d Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Mon, 20 Nov 2023 12:14:18 +0800 Subject: [PATCH 052/310] cachefiles: introduce object ondemand state Previously, @ondemand_id field was used not only to identify ondemand state of the object, but also to represent the index of the xarray. This commit introduces @state field to decouple the role of @ondemand_id and adds helpers to access it. Signed-off-by: Jia Zhu Link: https://lore.kernel.org/r/20231120041422.75170-2-zhujia.zj@bytedance.com Reviewed-by: Jingbo Xu Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 21 +++++++++++++++++++++ fs/cachefiles/ondemand.c | 21 +++++++++------------ 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 2ad58c465208..00beedeaec18 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -44,6 +44,11 @@ struct cachefiles_volume { struct dentry *fanout[256]; /* Fanout subdirs */ }; +enum cachefiles_object_state { + CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ + CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ +}; + /* * Backing file state. */ @@ -62,6 +67,7 @@ struct cachefiles_object { #define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ #ifdef CONFIG_CACHEFILES_ONDEMAND int ondemand_id; + enum cachefiles_object_state state; #endif }; @@ -296,6 +302,21 @@ extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); extern int cachefiles_ondemand_read(struct cachefiles_object *object, loff_t pos, size_t len); +#define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \ +static inline bool \ +cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \ +{ \ + return object->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} \ + \ +static inline void \ +cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ +{ \ + object->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ +} + +CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); +CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 0254ed39f68c..90456b8a4b3e 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -15,6 +15,7 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, xa_lock(&cache->reqs); object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + cachefiles_ondemand_set_object_close(object); /* * Flush all pending READ requests since their completion depends on @@ -176,6 +177,8 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); trace_cachefiles_ondemand_copen(req->object, id, size); + cachefiles_ondemand_set_object_open(req->object); + out: complete(&req->done); return ret; @@ -363,7 +366,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, /* coupled with the barrier in cachefiles_flush_reqs() */ smp_mb(); - if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) { + if (opcode != CACHEFILES_OP_OPEN && + !cachefiles_ondemand_object_is_open(object)) { WARN_ON_ONCE(object->ondemand_id == 0); xas_unlock(&xas); ret = -EIO; @@ -430,18 +434,11 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, void *private) { struct cachefiles_object *object = req->object; - int object_id = object->ondemand_id; - /* - * It's possible that object id is still 0 if the cookie looking up - * phase failed before OPEN request has ever been sent. Also avoid - * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means - * anon_fd has already been closed. - */ - if (object_id <= 0) + if (!cachefiles_ondemand_object_is_open(object)) return -ENOENT; - req->msg.object_id = object_id; + req->msg.object_id = object->ondemand_id; trace_cachefiles_ondemand_close(object, &req->msg); return 0; } @@ -460,7 +457,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, int object_id = object->ondemand_id; /* Stop enqueuing requests when daemon has closed anon_fd. */ - if (object_id <= 0) { + if (!cachefiles_ondemand_object_is_open(object)) { WARN_ON_ONCE(object_id == 0); pr_info_once("READ: anonymous fd closed prematurely.\n"); return -EIO; @@ -485,7 +482,7 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) * creating a new tmpfile as the cache file. Reuse the previously * allocated object ID if any. */ - if (object->ondemand_id > 0) + if (cachefiles_ondemand_object_is_open(object)) return 0; volume_key_size = volume->key[0] + 1; From 3c5ecfe16e7699011c12c2d44e55437415331fa3 Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Mon, 20 Nov 2023 12:14:19 +0800 Subject: [PATCH 053/310] cachefiles: extract ondemand info field from cachefiles_object We'll introduce a @work_struct field for @object in subsequent patches, it will enlarge the size of @object. As the result of that, this commit extracts ondemand info field from @object. Signed-off-by: Jia Zhu Link: https://lore.kernel.org/r/20231120041422.75170-3-zhujia.zj@bytedance.com Reviewed-by: Jingbo Xu Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/interface.c | 7 ++++++- fs/cachefiles/internal.h | 26 ++++++++++++++++++++++---- fs/cachefiles/ondemand.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 40052bdb3365..35ba2117a6f6 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -31,6 +31,11 @@ struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie) if (!object) return NULL; + if (cachefiles_ondemand_init_obj_info(object, volume)) { + kmem_cache_free(cachefiles_object_jar, object); + return NULL; + } + refcount_set(&object->ref, 1); spin_lock_init(&object->lock); @@ -88,7 +93,7 @@ void cachefiles_put_object(struct cachefiles_object *object, ASSERTCMP(object->file, ==, NULL); kfree(object->d_name); - + cachefiles_ondemand_deinit_obj_info(object); cache = object->volume->cache->cache; fscache_put_cookie(object->cookie, fscache_cookie_put_object); object->cookie = NULL; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 00beedeaec18..b0fe76964bc0 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -49,6 +49,12 @@ enum cachefiles_object_state { CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ }; +struct cachefiles_ondemand_info { + int ondemand_id; + enum cachefiles_object_state state; + struct cachefiles_object *object; +}; + /* * Backing file state. */ @@ -66,8 +72,7 @@ struct cachefiles_object { unsigned long flags; #define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ #ifdef CONFIG_CACHEFILES_ONDEMAND - int ondemand_id; - enum cachefiles_object_state state; + struct cachefiles_ondemand_info *ondemand; #endif }; @@ -302,17 +307,21 @@ extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); extern int cachefiles_ondemand_read(struct cachefiles_object *object, loff_t pos, size_t len); +extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume); +extern void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj); + #define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \ static inline bool \ cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \ { \ - return object->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ + return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ } \ \ static inline void \ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ { \ - object->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ + object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \ } CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); @@ -338,6 +347,15 @@ static inline int cachefiles_ondemand_read(struct cachefiles_object *object, { return -EOPNOTSUPP; } + +static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj, + struct cachefiles_volume *volume) +{ + return 0; +} +static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj) +{ +} #endif /* diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 90456b8a4b3e..deb7e3007aa1 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -9,12 +9,13 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, { struct cachefiles_object *object = file->private_data; struct cachefiles_cache *cache = object->volume->cache; - int object_id = object->ondemand_id; + struct cachefiles_ondemand_info *info = object->ondemand; + int object_id = info->ondemand_id; struct cachefiles_req *req; XA_STATE(xas, &cache->reqs, 0); xa_lock(&cache->reqs); - object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; + info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; cachefiles_ondemand_set_object_close(object); /* @@ -222,7 +223,7 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) load = (void *)req->msg.data; load->fd = fd; req->msg.object_id = object_id; - object->ondemand_id = object_id; + object->ondemand->ondemand_id = object_id; cachefiles_get_unbind_pincount(cache); trace_cachefiles_ondemand_open(object, &req->msg, load); @@ -368,7 +369,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, if (opcode != CACHEFILES_OP_OPEN && !cachefiles_ondemand_object_is_open(object)) { - WARN_ON_ONCE(object->ondemand_id == 0); + WARN_ON_ONCE(object->ondemand->ondemand_id == 0); xas_unlock(&xas); ret = -EIO; goto out; @@ -438,7 +439,7 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, if (!cachefiles_ondemand_object_is_open(object)) return -ENOENT; - req->msg.object_id = object->ondemand_id; + req->msg.object_id = object->ondemand->ondemand_id; trace_cachefiles_ondemand_close(object, &req->msg); return 0; } @@ -454,7 +455,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, struct cachefiles_object *object = req->object; struct cachefiles_read *load = (void *)req->msg.data; struct cachefiles_read_ctx *read_ctx = private; - int object_id = object->ondemand_id; + int object_id = object->ondemand->ondemand_id; /* Stop enqueuing requests when daemon has closed anon_fd. */ if (!cachefiles_ondemand_object_is_open(object)) { @@ -500,6 +501,27 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object) cachefiles_ondemand_init_close_req, NULL); } +int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, + struct cachefiles_volume *volume) +{ + if (!cachefiles_in_ondemand_mode(volume->cache)) + return 0; + + object->ondemand = kzalloc(sizeof(struct cachefiles_ondemand_info), + GFP_KERNEL); + if (!object->ondemand) + return -ENOMEM; + + object->ondemand->object = object; + return 0; +} + +void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *object) +{ + kfree(object->ondemand); + object->ondemand = NULL; +} + int cachefiles_ondemand_read(struct cachefiles_object *object, loff_t pos, size_t len) { From 0a7e54c1959c0feb2de23397ec09c7692364313e Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Mon, 20 Nov 2023 12:14:20 +0800 Subject: [PATCH 054/310] cachefiles: resend an open request if the read request's object is closed When an anonymous fd is closed by user daemon, if there is a new read request for this file comes up, the anonymous fd should be re-opened to handle that read request rather than fail it directly. 1. Introduce reopening state for objects that are closed but have inflight/subsequent read requests. 2. No longer flush READ requests but only CLOSE requests when anonymous fd is closed. 3. Enqueue the reopen work to workqueue, thus user daemon could get rid of daemon_read context and handle that request smoothly. Otherwise, the user daemon will send a reopen request and wait for itself to process the request. Signed-off-by: Jia Zhu Link: https://lore.kernel.org/r/20231120041422.75170-4-zhujia.zj@bytedance.com Reviewed-by: Jingbo Xu Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 3 ++ fs/cachefiles/ondemand.c | 98 ++++++++++++++++++++++++++++------------ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index b0fe76964bc0..b9a90f1a0c01 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -47,9 +47,11 @@ struct cachefiles_volume { enum cachefiles_object_state { CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ + CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ }; struct cachefiles_ondemand_info { + struct work_struct ondemand_work; int ondemand_id; enum cachefiles_object_state state; struct cachefiles_object *object; @@ -326,6 +328,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); +CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index deb7e3007aa1..8e130de952f7 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -18,14 +18,10 @@ static int cachefiles_ondemand_fd_release(struct inode *inode, info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED; cachefiles_ondemand_set_object_close(object); - /* - * Flush all pending READ requests since their completion depends on - * anon_fd. - */ - xas_for_each(&xas, req, ULONG_MAX) { + /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */ + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { if (req->msg.object_id == object_id && - req->msg.opcode == CACHEFILES_OP_READ) { - req->error = -EIO; + req->msg.opcode == CACHEFILES_OP_CLOSE) { complete(&req->done); xas_store(&xas, NULL); } @@ -179,6 +175,7 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args) trace_cachefiles_ondemand_copen(req->object, id, size); cachefiles_ondemand_set_object_open(req->object); + wake_up_all(&cache->daemon_pollwq); out: complete(&req->done); @@ -222,7 +219,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) load = (void *)req->msg.data; load->fd = fd; - req->msg.object_id = object_id; object->ondemand->ondemand_id = object_id; cachefiles_get_unbind_pincount(cache); @@ -238,6 +234,43 @@ err: return ret; } +static void ondemand_object_worker(struct work_struct *work) +{ + struct cachefiles_ondemand_info *info = + container_of(work, struct cachefiles_ondemand_info, ondemand_work); + + cachefiles_ondemand_init_object(info->object); +} + +/* + * If there are any inflight or subsequent READ requests on the + * closed object, reopen it. + * Skip read requests whose related object is reopening. + */ +static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xas, + unsigned long xa_max) +{ + struct cachefiles_req *req; + struct cachefiles_object *object; + struct cachefiles_ondemand_info *info; + + xas_for_each_marked(xas, req, xa_max, CACHEFILES_REQ_NEW) { + if (req->msg.opcode != CACHEFILES_OP_READ) + return req; + object = req->object; + info = object->ondemand; + if (cachefiles_ondemand_object_is_close(object)) { + cachefiles_ondemand_set_object_reopening(object); + queue_work(fscache_wq, &info->ondemand_work); + continue; + } + if (cachefiles_ondemand_object_is_reopening(object)) + continue; + return req; + } + return NULL; +} + ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) { @@ -248,16 +281,16 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, int ret = 0; XA_STATE(xas, &cache->reqs, cache->req_id_next); + xa_lock(&cache->reqs); /* * Cyclically search for a request that has not ever been processed, * to prevent requests from being processed repeatedly, and make * request distribution fair. */ - xa_lock(&cache->reqs); - req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW); + req = cachefiles_ondemand_select_req(&xas, ULONG_MAX); if (!req && cache->req_id_next > 0) { xas_set(&xas, 0); - req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW); + req = cachefiles_ondemand_select_req(&xas, cache->req_id_next - 1); } if (!req) { xa_unlock(&cache->reqs); @@ -277,14 +310,18 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, xa_unlock(&cache->reqs); id = xas.xa_index; - msg->msg_id = id; if (msg->opcode == CACHEFILES_OP_OPEN) { ret = cachefiles_ondemand_get_fd(req); - if (ret) + if (ret) { + cachefiles_ondemand_set_object_close(req->object); goto error; + } } + msg->msg_id = id; + msg->object_id = req->object->ondemand->ondemand_id; + if (copy_to_user(_buffer, msg, n) != 0) { ret = -EFAULT; goto err_put_fd; @@ -317,19 +354,23 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, void *private) { struct cachefiles_cache *cache = object->volume->cache; - struct cachefiles_req *req; + struct cachefiles_req *req = NULL; XA_STATE(xas, &cache->reqs, 0); int ret; if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) return 0; - if (test_bit(CACHEFILES_DEAD, &cache->flags)) - return -EIO; + if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + ret = -EIO; + goto out; + } req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL); - if (!req) - return -ENOMEM; + if (!req) { + ret = -ENOMEM; + goto out; + } req->object = object; init_completion(&req->done); @@ -367,7 +408,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, /* coupled with the barrier in cachefiles_flush_reqs() */ smp_mb(); - if (opcode != CACHEFILES_OP_OPEN && + if (opcode == CACHEFILES_OP_CLOSE && !cachefiles_ondemand_object_is_open(object)) { WARN_ON_ONCE(object->ondemand->ondemand_id == 0); xas_unlock(&xas); @@ -392,7 +433,15 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, wake_up_all(&cache->daemon_pollwq); wait_for_completion(&req->done); ret = req->error; + kfree(req); + return ret; out: + /* Reset the object to close state in error handling path. + * If error occurs after creating the anonymous fd, + * cachefiles_ondemand_fd_release() will set object to close. + */ + if (opcode == CACHEFILES_OP_OPEN) + cachefiles_ondemand_set_object_close(object); kfree(req); return ret; } @@ -439,7 +488,6 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req, if (!cachefiles_ondemand_object_is_open(object)) return -ENOENT; - req->msg.object_id = object->ondemand->ondemand_id; trace_cachefiles_ondemand_close(object, &req->msg); return 0; } @@ -455,16 +503,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req, struct cachefiles_object *object = req->object; struct cachefiles_read *load = (void *)req->msg.data; struct cachefiles_read_ctx *read_ctx = private; - int object_id = object->ondemand->ondemand_id; - /* Stop enqueuing requests when daemon has closed anon_fd. */ - if (!cachefiles_ondemand_object_is_open(object)) { - WARN_ON_ONCE(object_id == 0); - pr_info_once("READ: anonymous fd closed prematurely.\n"); - return -EIO; - } - - req->msg.object_id = object_id; load->off = read_ctx->off; load->len = read_ctx->len; trace_cachefiles_ondemand_read(object, &req->msg, load); @@ -513,6 +552,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, return -ENOMEM; object->ondemand->object = object; + INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker); return 0; } From b817e22b2e91257ace32a6768c3c003faeaa1c5c Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Mon, 20 Nov 2023 12:14:21 +0800 Subject: [PATCH 055/310] cachefiles: narrow the scope of triggering EPOLLIN events in ondemand mode Don't trigger EPOLLIN when there are only reopening read requests in xarray. Suggested-by: Xin Yin Signed-off-by: Jia Zhu Link: https://lore.kernel.org/r/20231120041422.75170-5-zhujia.zj@bytedance.com Reviewed-by: Jingbo Xu Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 14 ++++++++++++-- fs/cachefiles/internal.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index aa4efcabb5e3..70caa1946207 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -355,14 +355,24 @@ static __poll_t cachefiles_daemon_poll(struct file *file, struct poll_table_struct *poll) { struct cachefiles_cache *cache = file->private_data; + XA_STATE(xas, &cache->reqs, 0); + struct cachefiles_req *req; __poll_t mask; poll_wait(file, &cache->daemon_pollwq, poll); mask = 0; if (cachefiles_in_ondemand_mode(cache)) { - if (!xa_empty(&cache->reqs)) - mask |= EPOLLIN; + if (!xa_empty(&cache->reqs)) { + rcu_read_lock(); + xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { + if (!cachefiles_ondemand_is_reopening_read(req)) { + mask |= EPOLLIN; + break; + } + } + rcu_read_unlock(); + } } else { if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) mask |= EPOLLIN; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index b9a90f1a0c01..26e5f8f123ef 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -329,6 +329,13 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return cachefiles_ondemand_object_is_reopening(req->object) && + req->msg.opcode == CACHEFILES_OP_READ; +} + #else static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, char __user *_buffer, size_t buflen) @@ -359,6 +366,11 @@ static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *ob static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj) { } + +static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) +{ + return false; +} #endif /* From e73fa11a356ca0905c3cc648eaacc6f0f2d2c8b3 Mon Sep 17 00:00:00 2001 From: Jia Zhu Date: Mon, 20 Nov 2023 12:14:22 +0800 Subject: [PATCH 056/310] cachefiles: add restore command to recover inflight ondemand read requests Previously, in ondemand read scenario, if the anonymous fd was closed by user daemon, inflight and subsequent read requests would return EIO. As long as the device connection is not released, user daemon can hold and restore inflight requests by setting the request flag to CACHEFILES_REQ_NEW. Suggested-by: Gao Xiang Signed-off-by: Jia Zhu Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20231120041422.75170-6-zhujia.zj@bytedance.com Reviewed-by: Jingbo Xu Reviewed-by: David Howells Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 1 + fs/cachefiles/internal.h | 3 +++ fs/cachefiles/ondemand.c | 23 +++++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 70caa1946207..3f24905f4066 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -77,6 +77,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = { { "tag", cachefiles_daemon_tag }, #ifdef CONFIG_CACHEFILES_ONDEMAND { "copen", cachefiles_ondemand_copen }, + { "restore", cachefiles_ondemand_restore }, #endif { "", NULL } }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 26e5f8f123ef..4a87c9d714a9 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -303,6 +303,9 @@ extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache, extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args); +extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache, + char *args); + extern int cachefiles_ondemand_init_object(struct cachefiles_object *object); extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 8e130de952f7..b8fbbb1961bb 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -182,6 +182,29 @@ out: return ret; } +int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args) +{ + struct cachefiles_req *req; + + XA_STATE(xas, &cache->reqs, 0); + + if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags)) + return -EOPNOTSUPP; + + /* + * Reset the requests to CACHEFILES_REQ_NEW state, so that the + * requests have been processed halfway before the crash of the + * user daemon could be reprocessed after the recovery. + */ + xas_lock(&xas); + xas_for_each(&xas, req, ULONG_MAX) + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + xas_unlock(&xas); + + wake_up_all(&cache->daemon_pollwq); + return 0; +} + static int cachefiles_ondemand_get_fd(struct cachefiles_req *req) { struct cachefiles_object *object; From 28a9466d75a861c26ba57b64a0a9a436a7c61e77 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Nov 2023 10:00:59 +0100 Subject: [PATCH 057/310] MAINTAINERS: Add include/linux/lockdep*.h Have lockdep_api.h and lockdep_types.h match as well. Signed-off-by: Christophe JAILLET Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://lore.kernel.org/r/e722abd043e5de64d2acd28d581e4a952994a94e.1700989248.git.christophe.jaillet@wanadoo.fr --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..9834adcdd50a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12458,7 +12458,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core F: Documentation/locking/ F: arch/*/include/asm/spinlock*.h -F: include/linux/lockdep.h +F: include/linux/lockdep*.h F: include/linux/mutex*.h F: include/linux/rwlock*.h F: include/linux/rwsem*.h From 18caaedaf4c3712ab6821f292598a8f86e6d7972 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Nov 2023 09:56:29 +0100 Subject: [PATCH 058/310] locking/lockdep: Slightly reorder 'struct lock_class' to save some memory Based on pahole, 2 holes can be combined in the 'struct lock_class'. This saves 8 bytes in the structure on my x86_64. On a x86_64 configured with allmodconfig, this saves ~64kb of memory in 'kernel/locking/lockdep.o': text data bss dec filename Before: 102,501 1,912,490 11,531,636 13,546,627 kernel/locking/lockdep.o After: 102,181 1,912,490 11,466,100 13,480,771 kernel/locking/lockdep.o because of: struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; After the reorder, pahole gives: struct lock_class { struct hlist_node hash_entry; /* 0 16 */ struct list_head lock_entry; /* 16 16 */ struct list_head locks_after; /* 32 16 */ struct list_head locks_before; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ const struct lockdep_subclass_key * key; /* 64 8 */ lock_cmp_fn cmp_fn; /* 72 8 */ lock_print_fn print_fn; /* 80 8 */ unsigned int subclass; /* 88 4 */ unsigned int dep_gen_id; /* 92 4 */ long unsigned int usage_mask; /* 96 8 */ const struct lock_trace * usage_traces[10]; /* 104 80 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ const char * name; /* 184 8 */ /* --- cacheline 3 boundary (192 bytes) --- */ int name_version; /* 192 4 */ u8 wait_type_inner; /* 196 1 */ u8 wait_type_outer; /* 197 1 */ u8 lock_type; /* 198 1 */ /* XXX 1 byte hole, try to pack */ long unsigned int contention_point[4]; /* 200 32 */ long unsigned int contending_point[4]; /* 232 32 */ /* size: 264, cachelines: 5, members: 18 */ /* sum members: 263, holes: 1, sum holes: 1 */ /* last cacheline: 8 bytes */ }; Signed-off-by: Christophe JAILLET Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://lore.kernel.org/r/801258371fc4101f96495a5aaecef638d6cbd8d3.1700988869.git.christophe.jaillet@wanadoo.fr --- include/linux/lockdep_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ebc323d345a..857d785e89e6 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -127,12 +127,12 @@ struct lock_class { unsigned long usage_mask; const struct lock_trace *usage_traces[LOCK_TRACE_STATES]; + const char *name; /* * Generation counter, when doing certain classes of graph walking, * to ensure that we check one node only once: */ int name_version; - const char *name; u8 wait_type_inner; u8 wait_type_outer; From 82d30723d58fccbd2d7d707fab7649b541fafa1b Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 13 Nov 2023 09:15:43 +0800 Subject: [PATCH 059/310] misc: ocxl: context: Remove unnecessary (void*) conversions The ctx pointer does not need to cast the type. Signed-off-by: Li zeming Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113011543.6940-1-zeming@nfschina.com --- drivers/misc/ocxl/context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 7f83116ae11a..cded7d1caf32 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(ocxl_context_alloc); */ static void xsl_fault_error(void *data, u64 addr, u64 dsisr) { - struct ocxl_context *ctx = (struct ocxl_context *) data; + struct ocxl_context *ctx = data; mutex_lock(&ctx->xsl_error_lock); ctx->xsl_error.addr = addr; From 84ba5d3675e23e6fa824a2268c5b6a04b52dde4d Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 13 Nov 2023 09:22:02 +0800 Subject: [PATCH 060/310] misc: ocxl: afu_irq: Remove unnecessary (void*) conversions The irq pointer does not need to cast the type. Signed-off-by: Li zeming Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113012202.7887-1-zeming@nfschina.com --- drivers/misc/ocxl/afu_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c index a06920b7e049..36f7379b8e2d 100644 --- a/drivers/misc/ocxl/afu_irq.c +++ b/drivers/misc/ocxl/afu_irq.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL_GPL(ocxl_irq_set_handler); static irqreturn_t afu_irq_handler(int virq, void *data) { - struct afu_irq *irq = (struct afu_irq *) data; + struct afu_irq *irq = data; trace_ocxl_afu_irq_receive(virq); From 220f3ced8e42b1efe9c6b84778fb0c77c0c56611 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 13 Nov 2023 09:45:33 +0800 Subject: [PATCH 061/310] misc: ocxl: link: Remove unnecessary (void*) conversions The link pointer does not need to cast the type. Signed-off-by: Li zeming Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113014533.11064-1-zeming@nfschina.com --- drivers/misc/ocxl/link.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index c06c699c0e7b..03402203cacd 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -188,7 +188,7 @@ ack: static irqreturn_t xsl_fault_handler(int irq, void *data) { - struct ocxl_link *link = (struct ocxl_link *) data; + struct ocxl_link *link = data; struct spa *spa = link->spa; u64 dsisr, dar, pe_handle; struct pe_data *pe_data; @@ -483,7 +483,7 @@ static void release_xsl(struct kref *ref) void ocxl_link_release(struct pci_dev *dev, void *link_handle) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; mutex_lock(&links_list_lock); kref_put(&link->ref, release_xsl); @@ -540,7 +540,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr), void *xsl_err_data) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; int pe_handle, rc = 0; @@ -630,7 +630,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_add_pe); int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; int pe_handle, rc; @@ -666,7 +666,7 @@ int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) int ocxl_link_remove_pe(void *link_handle, int pasid) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; struct pe_data *pe_data; @@ -752,7 +752,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_remove_pe); int ocxl_link_irq_alloc(void *link_handle, int *hw_irq) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; int irq; if (atomic_dec_if_positive(&link->irq_available) < 0) @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc); void ocxl_link_free_irq(void *link_handle, int hw_irq) { - struct ocxl_link *link = (struct ocxl_link *) link_handle; + struct ocxl_link *link = link_handle; xive_native_free_irq(hw_irq); atomic_inc(&link->irq_available); From 29685ea5754f04c84ad443fd7c6869c68f636c26 Mon Sep 17 00:00:00 2001 From: Li kunyu Date: Mon, 13 Nov 2023 09:52:29 +0800 Subject: [PATCH 062/310] =?UTF-8?q?misc:=20ocxl:=20main:=20Remove=20unnece?= =?UTF-8?q?ssary=20=E2=80=980=E2=80=99=20values=20from=20rc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rc is assigned first, so it does not need to initialize the assignment. Signed-off-by: Li kunyu Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113015229.12074-1-kunyu@nfschina.com --- drivers/misc/ocxl/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c index ef73cf35dda2..658974143c3c 100644 --- a/drivers/misc/ocxl/main.c +++ b/drivers/misc/ocxl/main.c @@ -7,7 +7,7 @@ static int __init init_ocxl(void) { - int rc = 0; + int rc; if (!tlbie_capable) return -EINVAL; From 183bc0c640c785a710885a10b614193f114fe760 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Oct 2023 22:27:25 +1100 Subject: [PATCH 063/310] powerpc/configs/64s: Enable CONFIG_MEM_SOFT_DIRTY Enable CONFIG_MEM_SOFT_DIRTY to get some test coverage. Distros enable it, and it has been broken previously. See commit 66b2ca086210 ("powerpc/64s/radix: Fix soft dirty tracking"). Signed-off-by: Michael Ellerman Link: https://msgid.link/20231024112726.1819795-1-mpe@ellerman.id.au --- arch/powerpc/configs/ppc64_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 6e7b9e8fd225..544a65fda77b 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -92,6 +92,7 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZONE_DEVICE=y CONFIG_NET=y CONFIG_PACKET=y From 98eb30fe4c69a9b602f29e406317c49b5580352a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 25 Oct 2023 12:24:52 +1100 Subject: [PATCH 064/310] powerpc: Make cpu_spec __ro_after_init The cpu_spec is a struct holding various information about the CPU the kernel is executing on. It's populated early in boot and must not change after that. In particular the cpu_features and mmu_features hold the set of discovered CPU/MMU features and are used to set static keys for each feature, and do binary patching of assembly. So any change to the cpu_features/mmu_features later in boot will not be reflected in the state of the static keys or patched code. There is already logic to check that cpu_features/mmu_features don't change, see check_features() in feature-fixups.c. But as another layer of protection the entire cpu_spec should be read only after init, annotate it as such. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231025012452.1985680-1-mpe@ellerman.id.au --- arch/powerpc/kernel/cputable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index e97a0fd0ae90..6f6801da9dc1 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -20,9 +20,9 @@ #include #include -static struct cpu_spec the_cpu_spec __read_mostly; +static struct cpu_spec the_cpu_spec __ro_after_init; -struct cpu_spec* cur_cpu_spec __read_mostly = NULL; +struct cpu_spec *cur_cpu_spec __ro_after_init = NULL; EXPORT_SYMBOL(cur_cpu_spec); /* The platform string corresponding to the real PVR */ From 6f2a9e0e0ae5fb0697dd1660ede7e609be25ff6f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 13 Nov 2023 15:39:47 +1100 Subject: [PATCH 065/310] powerpc: Remove orphaned reg_a2.h Commit fb5a515704d7 ("powerpc: Remove platforms/wsp and associated pieces") removed the A2 CPU support, but missed removal of reg_a2.h. None of the defines contained in it are used, with the exception of the SPRN_TEN* values, but they are also defined in reg_booke.h. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113043947.1931831-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/reg_a2.h | 154 --------------------------- arch/powerpc/kernel/exceptions-64e.S | 1 - arch/powerpc/kernel/udbg_16550.c | 1 - 3 files changed, 156 deletions(-) delete mode 100644 arch/powerpc/include/asm/reg_a2.h diff --git a/arch/powerpc/include/asm/reg_a2.h b/arch/powerpc/include/asm/reg_a2.h deleted file mode 100644 index 74fba29e9491..000000000000 --- a/arch/powerpc/include/asm/reg_a2.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Register definitions specific to the A2 core - * - * Copyright (C) 2008 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. - */ - -#ifndef __ASM_POWERPC_REG_A2_H__ -#define __ASM_POWERPC_REG_A2_H__ - -#include - -#define SPRN_TENSR 0x1b5 -#define SPRN_TENS 0x1b6 /* Thread ENable Set */ -#define SPRN_TENC 0x1b7 /* Thread ENable Clear */ - -#define SPRN_A2_CCR0 0x3f0 /* Core Configuration Register 0 */ -#define SPRN_A2_CCR1 0x3f1 /* Core Configuration Register 1 */ -#define SPRN_A2_CCR2 0x3f2 /* Core Configuration Register 2 */ -#define SPRN_MMUCR0 0x3fc /* MMU Control Register 0 */ -#define SPRN_MMUCR1 0x3fd /* MMU Control Register 1 */ -#define SPRN_MMUCR2 0x3fe /* MMU Control Register 2 */ -#define SPRN_MMUCR3 0x3ff /* MMU Control Register 3 */ - -#define SPRN_IAR 0x372 - -#define SPRN_IUCR0 0x3f3 -#define IUCR0_ICBI_ACK 0x1000 - -#define SPRN_XUCR0 0x3f6 /* Execution Unit Config Register 0 */ - -#define A2_IERAT_SIZE 16 -#define A2_DERAT_SIZE 32 - -/* A2 MMUCR0 bits */ -#define MMUCR0_ECL 0x80000000 /* Extended Class for TLB fills */ -#define MMUCR0_TID_NZ 0x40000000 /* TID is non-zero */ -#define MMUCR0_TS 0x10000000 /* Translation space for TLB fills */ -#define MMUCR0_TGS 0x20000000 /* Guest space for TLB fills */ -#define MMUCR0_TLBSEL 0x0c000000 /* TLB or ERAT target for TLB fills */ -#define MMUCR0_TLBSEL_U 0x00000000 /* TLBSEL = UTLB */ -#define MMUCR0_TLBSEL_I 0x08000000 /* TLBSEL = I-ERAT */ -#define MMUCR0_TLBSEL_D 0x0c000000 /* TLBSEL = D-ERAT */ -#define MMUCR0_LOCKSRSH 0x02000000 /* Use TLB lock on tlbsx. */ -#define MMUCR0_TID_MASK 0x000000ff /* TID field */ - -/* A2 MMUCR1 bits */ -#define MMUCR1_IRRE 0x80000000 /* I-ERAT round robin enable */ -#define MMUCR1_DRRE 0x40000000 /* D-ERAT round robin enable */ -#define MMUCR1_REE 0x20000000 /* Reference Exception Enable*/ -#define MMUCR1_CEE 0x10000000 /* Change exception enable */ -#define MMUCR1_CSINV_ALL 0x00000000 /* Inval ERAT on all CS evts */ -#define MMUCR1_CSINV_NISYNC 0x04000000 /* Inval ERAT on all ex isync*/ -#define MMUCR1_CSINV_NEVER 0x0c000000 /* Don't inval ERAT on CS */ -#define MMUCR1_ICTID 0x00080000 /* IERAT class field as TID */ -#define MMUCR1_ITTID 0x00040000 /* IERAT thdid field as TID */ -#define MMUCR1_DCTID 0x00020000 /* DERAT class field as TID */ -#define MMUCR1_DTTID 0x00010000 /* DERAT thdid field as TID */ -#define MMUCR1_DCCD 0x00008000 /* DERAT class ignore */ -#define MMUCR1_TLBWE_BINV 0x00004000 /* back invalidate on tlbwe */ - -/* A2 MMUCR2 bits */ -#define MMUCR2_PSSEL_SHIFT 4 - -/* A2 MMUCR3 bits */ -#define MMUCR3_THID 0x0000000f /* Thread ID */ - -/* *** ERAT TLB bits definitions */ -#define TLB0_EPN_MASK ASM_CONST(0xfffffffffffff000) -#define TLB0_CLASS_MASK ASM_CONST(0x0000000000000c00) -#define TLB0_CLASS_00 ASM_CONST(0x0000000000000000) -#define TLB0_CLASS_01 ASM_CONST(0x0000000000000400) -#define TLB0_CLASS_10 ASM_CONST(0x0000000000000800) -#define TLB0_CLASS_11 ASM_CONST(0x0000000000000c00) -#define TLB0_V ASM_CONST(0x0000000000000200) -#define TLB0_X ASM_CONST(0x0000000000000100) -#define TLB0_SIZE_MASK ASM_CONST(0x00000000000000f0) -#define TLB0_SIZE_4K ASM_CONST(0x0000000000000010) -#define TLB0_SIZE_64K ASM_CONST(0x0000000000000030) -#define TLB0_SIZE_1M ASM_CONST(0x0000000000000050) -#define TLB0_SIZE_16M ASM_CONST(0x0000000000000070) -#define TLB0_SIZE_1G ASM_CONST(0x00000000000000a0) -#define TLB0_THDID_MASK ASM_CONST(0x000000000000000f) -#define TLB0_THDID_0 ASM_CONST(0x0000000000000001) -#define TLB0_THDID_1 ASM_CONST(0x0000000000000002) -#define TLB0_THDID_2 ASM_CONST(0x0000000000000004) -#define TLB0_THDID_3 ASM_CONST(0x0000000000000008) -#define TLB0_THDID_ALL ASM_CONST(0x000000000000000f) - -#define TLB1_RESVATTR ASM_CONST(0x00f0000000000000) -#define TLB1_U0 ASM_CONST(0x0008000000000000) -#define TLB1_U1 ASM_CONST(0x0004000000000000) -#define TLB1_U2 ASM_CONST(0x0002000000000000) -#define TLB1_U3 ASM_CONST(0x0001000000000000) -#define TLB1_R ASM_CONST(0x0000800000000000) -#define TLB1_C ASM_CONST(0x0000400000000000) -#define TLB1_RPN_MASK ASM_CONST(0x000003fffffff000) -#define TLB1_W ASM_CONST(0x0000000000000800) -#define TLB1_I ASM_CONST(0x0000000000000400) -#define TLB1_M ASM_CONST(0x0000000000000200) -#define TLB1_G ASM_CONST(0x0000000000000100) -#define TLB1_E ASM_CONST(0x0000000000000080) -#define TLB1_VF ASM_CONST(0x0000000000000040) -#define TLB1_UX ASM_CONST(0x0000000000000020) -#define TLB1_SX ASM_CONST(0x0000000000000010) -#define TLB1_UW ASM_CONST(0x0000000000000008) -#define TLB1_SW ASM_CONST(0x0000000000000004) -#define TLB1_UR ASM_CONST(0x0000000000000002) -#define TLB1_SR ASM_CONST(0x0000000000000001) - -/* A2 erativax attributes definitions */ -#define ERATIVAX_RS_IS_ALL 0x000 -#define ERATIVAX_RS_IS_TID 0x040 -#define ERATIVAX_RS_IS_CLASS 0x080 -#define ERATIVAX_RS_IS_FULLMATCH 0x0c0 -#define ERATIVAX_CLASS_00 0x000 -#define ERATIVAX_CLASS_01 0x010 -#define ERATIVAX_CLASS_10 0x020 -#define ERATIVAX_CLASS_11 0x030 -#define ERATIVAX_PSIZE_4K (TLB_PSIZE_4K >> 1) -#define ERATIVAX_PSIZE_64K (TLB_PSIZE_64K >> 1) -#define ERATIVAX_PSIZE_1M (TLB_PSIZE_1M >> 1) -#define ERATIVAX_PSIZE_16M (TLB_PSIZE_16M >> 1) -#define ERATIVAX_PSIZE_1G (TLB_PSIZE_1G >> 1) - -/* A2 eratilx attributes definitions */ -#define ERATILX_T_ALL 0 -#define ERATILX_T_TID 1 -#define ERATILX_T_TGS 2 -#define ERATILX_T_FULLMATCH 3 -#define ERATILX_T_CLASS0 4 -#define ERATILX_T_CLASS1 5 -#define ERATILX_T_CLASS2 6 -#define ERATILX_T_CLASS3 7 - -/* XUCR0 bits */ -#define XUCR0_TRACE_UM_T0 0x40000000 /* Thread 0 */ -#define XUCR0_TRACE_UM_T1 0x20000000 /* Thread 1 */ -#define XUCR0_TRACE_UM_T2 0x10000000 /* Thread 2 */ -#define XUCR0_TRACE_UM_T3 0x08000000 /* Thread 3 */ - -/* A2 CCR0 register */ -#define A2_CCR0_PME_DISABLED 0x00000000 -#define A2_CCR0_PME_SLEEP 0x40000000 -#define A2_CCR0_PME_RVW 0x80000000 -#define A2_CCR0_PME_DISABLED2 0xc0000000 - -/* A2 CCR2 register */ -#define A2_CCR2_ERAT_ONLY_MODE 0x00000001 -#define A2_CCR2_ENABLE_ICSWX 0x00000002 -#define A2_CCR2_ENABLE_PC 0x20000000 -#define A2_CCR2_ENABLE_TRACE 0x40000000 - -#endif /* __ASM_POWERPC_REG_A2_H__ */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 7ab4c8c0f1ab..dcf0591ad3c2 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/udbg_16550.c b/arch/powerpc/kernel/udbg_16550.c index 74ddf836f7a2..a0467e528b70 100644 --- a/arch/powerpc/kernel/udbg_16550.c +++ b/arch/powerpc/kernel/udbg_16550.c @@ -7,7 +7,6 @@ #include #include #include -#include #include extern u8 real_readb(volatile u8 __iomem *addr); From c8a1634145c23a5a979a7166a12b99871812a6ab Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 13 Nov 2023 16:19:29 +1100 Subject: [PATCH 066/310] powerpc/32: Drop unused grackle_set_stg() The call to grackle_set_stg() ("Store Gathering") has always been inside an #ifdef 0, since the code was first merged in v2.3.43pre7. Apparently it was suspected of causing problems on some hardware so was disabled. No one has ever proved otherwise so drop the code as unused for now. Reported-by: kernel test robot Reported-by: Bjorn Helgaas Closes: https://lore.kernel.org/all/20231031145600.GA9161@bhelgaas/ Signed-off-by: Michael Ellerman Link: https://msgid.link/20231113051929.1952351-1-mpe@ellerman.id.au --- arch/powerpc/sysdev/grackle.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/powerpc/sysdev/grackle.c b/arch/powerpc/sysdev/grackle.c index fd2f94a884f0..7dce8278b71e 100644 --- a/arch/powerpc/sysdev/grackle.c +++ b/arch/powerpc/sysdev/grackle.c @@ -18,24 +18,8 @@ #define GRACKLE_CFA(b, d, o) (0x80 | ((b) << 8) | ((d) << 16) \ | (((o) & ~3) << 24)) -#define GRACKLE_PICR1_STG 0x00000040 #define GRACKLE_PICR1_LOOPSNOOP 0x00000010 -/* N.B. this is called before bridges is initialized, so we can't - use grackle_pcibios_{read,write}_config_dword. */ -static inline void grackle_set_stg(struct pci_controller* bp, int enable) -{ - unsigned int val; - - out_be32(bp->cfg_addr, GRACKLE_CFA(0, 0, 0xa8)); - val = in_le32(bp->cfg_data); - val = enable? (val | GRACKLE_PICR1_STG) : - (val & ~GRACKLE_PICR1_STG); - out_be32(bp->cfg_addr, GRACKLE_CFA(0, 0, 0xa8)); - out_le32(bp->cfg_data, val); - (void)in_le32(bp->cfg_data); -} - static inline void grackle_set_loop_snoop(struct pci_controller *bp, int enable) { unsigned int val; @@ -56,7 +40,4 @@ void __init setup_grackle(struct pci_controller *hose) pci_add_flags(PCI_REASSIGN_ALL_BUS); if (of_machine_is_compatible("AAPL,PowerBook1998")) grackle_set_loop_snoop(hose, 1); -#if 0 /* Disabled for now, HW problems ??? */ - grackle_set_stg(hose, 1); -#endif } From 1b1e38002648819c04773647d5242990e2824264 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 21 Nov 2023 08:23:32 +0900 Subject: [PATCH 067/310] powerpc: add crtsavres.o to always-y instead of extra-y crtsavres.o is linked to modules. However, as explained in commit d0e628cd817f ("kbuild: doc: clarify the difference between extra-y and always-y"), 'make modules' does not build extra-y. For example, the following command fails: $ make ARCH=powerpc LLVM=1 KBUILD_MODPOST_WARN=1 mrproper ps3_defconfig modules [snip] LD [M] arch/powerpc/platforms/cell/spufs/spufs.ko ld.lld: error: cannot open arch/powerpc/lib/crtsavres.o: No such file or directory make[3]: *** [scripts/Makefile.modfinal:56: arch/powerpc/platforms/cell/spufs/spufs.ko] Error 1 make[2]: *** [Makefile:1844: modules] Error 2 make[1]: *** [/home/masahiro/workspace/linux-kbuild/Makefile:350: __build_one_by_one] Error 2 make: *** [Makefile:234: __sub-make] Error 2 Signed-off-by: Masahiro Yamada Fixes: baa25b571a16 ("powerpc/64: Do not link crtsavres.o in vmlinux") Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://msgid.link/20231120232332.4100288-1-masahiroy@kernel.org --- arch/powerpc/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 51ad0397c17a..6eac63e79a89 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o # so it is only needed for modules, and only for older linkers which # do not support --save-restore-funcs ifndef CONFIG_LD_IS_BFD -extra-$(CONFIG_PPC64) += crtsavres.o +always-$(CONFIG_PPC64) += crtsavres.o endif obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ From 45b1ba7e5d1f6881050d558baf9bc74a2ae13930 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 22 Nov 2023 11:06:51 +0800 Subject: [PATCH 068/310] powerpc/xics: Check return value of kasprintf in icp_native_map_one_cpu kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231122030651.3818-1-chentao@kylinos.cn --- arch/powerpc/sysdev/xics/icp-native.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index f6ec6dba92dc..700b67476a7d 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -236,6 +236,8 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr, rname = kasprintf(GFP_KERNEL, "CPU %d [0x%x] Interrupt Presentation", cpu, hw_id); + if (!rname) + return -ENOMEM; if (!request_mem_region(addr, size, rname)) { pr_warn("icp_native: Could not reserve ICP MMIO for CPU %d, interrupt server #0x%x\n", cpu, hw_id); From df99da19c6c24ab65052ae1bc0904f99069478d9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 21 Nov 2023 10:54:36 +1100 Subject: [PATCH 069/310] powerpc/lib: Avoid array bounds warnings in vec ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with GCC with -Warray-bounds enabled there are several warnings in sstep.c along the lines of: In function ‘do_byte_reverse’, inlined from ‘do_vec_load’ at arch/powerpc/lib/sstep.c:691:3, inlined from ‘emulate_loadstore’ at arch/powerpc/lib/sstep.c:3439:9: arch/powerpc/lib/sstep.c:289:23: error: array subscript 2 is outside array bounds of ‘u8[16]’ {aka ‘unsigned char[16]’} [-Werror=array-bounds=] 289 | up[2] = byterev_8(up[1]); | ~~~~~~^~~~~~~~~~~~~~~~~~ arch/powerpc/lib/sstep.c: In function ‘emulate_loadstore’: arch/powerpc/lib/sstep.c:681:11: note: at offset 16 into object ‘u’ of size 16 681 | } u = {}; | ^ do_byte_reverse() supports a size up to 32 bytes, but in these cases the caller is only passing a 16 byte buffer. In practice there is no bug, do_vec_load() is only called from the LOAD_VMX case in emulate_loadstore(). That in turn is only reached when analyse_instr() recognises VMX ops, and in all cases the size is no greater than 16: $ git grep -w LOAD_VMX arch/powerpc/lib/sstep.c arch/powerpc/lib/sstep.c: op->type = MKOP(LOAD_VMX, 0, 1); arch/powerpc/lib/sstep.c: op->type = MKOP(LOAD_VMX, 0, 2); arch/powerpc/lib/sstep.c: op->type = MKOP(LOAD_VMX, 0, 4); arch/powerpc/lib/sstep.c: op->type = MKOP(LOAD_VMX, 0, 16); Similarly for do_vec_store(). Although the warning is incorrect, the code would be safer if it clamped the size from the caller to the known size of the buffer. Do that using min_t(). Reported-by: Bagas Sanjaya Closes: https://lore.kernel.org/linuxppc-dev/YpbUcPrm61RLIiZF@debian.me/ Reported-by: Jan-Benedict Glaw Closes: https://lore.kernel.org/linuxppc-dev/20221212215117.aa7255t7qd6yefk4@lug-owl.de/ Reported-by: "Gustavo A. R. Silva" Closes: https://lore.kernel.org/linuxppc-dev/6a8bf78c-aedb-4d5a-b0aa-82a51a17b884@embeddedor.com/ Reviewed-by: "Gustavo A. R. Silva" Build-tested-by: "Gustavo A. R. Silva" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231120235436.1569255-1-mpe@ellerman.id.au --- arch/powerpc/lib/sstep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index a4ab8625061a..a13f05cfc7db 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -688,7 +688,7 @@ static nokprobe_inline int do_vec_load(int rn, unsigned long ea, if (err) return err; if (unlikely(cross_endian)) - do_byte_reverse(&u.b[ea & 0xf], size); + do_byte_reverse(&u.b[ea & 0xf], min_t(size_t, size, sizeof(u))); preempt_disable(); if (regs->msr & MSR_VEC) put_vr(rn, &u.v); @@ -719,7 +719,7 @@ static nokprobe_inline int do_vec_store(int rn, unsigned long ea, u.v = current->thread.vr_state.vr[rn]; preempt_enable(); if (unlikely(cross_endian)) - do_byte_reverse(&u.b[ea & 0xf], size); + do_byte_reverse(&u.b[ea & 0xf], min_t(size_t, size, sizeof(u))); return copy_mem_out(&u.b[ea & 0xf], ea, size, regs); } #endif /* CONFIG_ALTIVEC */ From 8f9abaa6d7de0a70fc68acaedce290c1f96e2e59 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Thu, 23 Nov 2023 12:47:05 +0530 Subject: [PATCH 070/310] powerpc/lib: Validate size for vector operations Some of the fp/vmx code in sstep.c assume a certain maximum size for the instructions being emulated. The size of those operations however is determined separately in analyse_instr(). Add a check to validate the assumption on the maximum size of the operations, so as to prevent any unintended kernel stack corruption. Signed-off-by: Naveen N Rao Reviewed-by: Gustavo A. R. Silva Build-tested-by: Gustavo A. R. Silva Signed-off-by: Michael Ellerman Link: https://msgid.link/20231123071705.397625-1-naveen@kernel.org --- arch/powerpc/lib/sstep.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index a13f05cfc7db..5766180f5380 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -586,6 +586,8 @@ static int do_fp_load(struct instruction_op *op, unsigned long ea, } u; nb = GETSIZE(op->type); + if (nb > sizeof(u)) + return -EINVAL; if (!address_ok(regs, ea, nb)) return -EFAULT; rn = op->reg; @@ -636,6 +638,8 @@ static int do_fp_store(struct instruction_op *op, unsigned long ea, } u; nb = GETSIZE(op->type); + if (nb > sizeof(u)) + return -EINVAL; if (!address_ok(regs, ea, nb)) return -EFAULT; rn = op->reg; @@ -680,6 +684,9 @@ static nokprobe_inline int do_vec_load(int rn, unsigned long ea, u8 b[sizeof(__vector128)]; } u = {}; + if (size > sizeof(u)) + return -EINVAL; + if (!address_ok(regs, ea & ~0xfUL, 16)) return -EFAULT; /* align to multiple of size */ @@ -707,6 +714,9 @@ static nokprobe_inline int do_vec_store(int rn, unsigned long ea, u8 b[sizeof(__vector128)]; } u; + if (size > sizeof(u)) + return -EINVAL; + if (!address_ok(regs, ea & ~0xfUL, 16)) return -EFAULT; /* align to multiple of size */ From 0d555b57ee660d8a871781c0eebf006e855e918d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 27 Nov 2023 13:28:09 +1100 Subject: [PATCH 071/310] powerpc: pmd_move_must_withdraw() is only needed for CONFIG_TRANSPARENT_HUGEPAGE The linux-next build of powerpc64 allnoconfig fails with: arch/powerpc/mm/book3s64/pgtable.c:557:5: error: no previous prototype for 'pmd_move_must_withdraw' 557 | int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, | ^~~~~~~~~~~~~~~~~~~~~~ Caused by commit: c6345dfa6e3e ("Makefile.extrawarn: turn on missing-prototypes globally") Fix it by moving the function definition under CONFIG_TRANSPARENT_HUGEPAGE like the prototype. The function is only called when CONFIG_TRANSPARENT_HUGEPAGE=y. Signed-off-by: Stephen Rothwell [mpe: Flesh out change log from linux-next patch] Signed-off-by: Michael Ellerman Link: https://msgid.link/20231127132809.45c2b398@canb.auug.org.au --- arch/powerpc/mm/book3s64/pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index be229290a6a7..3438ab72c346 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -542,6 +542,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, set_pte_at(vma->vm_mm, addr, ptep, pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * For hash translation mode, we use the deposited table to store hash slot * information and they are stored at PTRS_PER_PMD offset from related pmd @@ -563,6 +564,7 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } +#endif /* * Does the CPU support tlbie? From ff03ff328fbd0a2b3a43e8b9bbc2a1d84265e77e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Sat, 18 Nov 2023 13:32:32 -0600 Subject: [PATCH 072/310] x86/mce/amd, EDAC/mce_amd: Move long names to decoder module The long names of the SMCA banks are only used by the MCE decoder module. Move them out of the arch code and into the decoder module. [ bp: Name the long names array "smca_long_names", drop local ptr in decode_smca_error(), constify arrays. ] Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231118193248.1296798-5-yazen.ghannam@amd.com --- arch/x86/include/asm/mce.h | 1 - arch/x86/kernel/cpu/mce/amd.c | 74 ++++++++++++++--------------------- drivers/edac/mce_amd.c | 46 ++++++++++++++++++++-- 3 files changed, 72 insertions(+), 49 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6de6e1d95952..4ad49afca2db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -333,7 +333,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index f3517b8a8e91..f6c6c1e0a581 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -87,42 +87,37 @@ struct smca_bank { static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -static struct smca_bank_name smca_names[] = { - [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, +static const char * const smca_names[] = { + [SMCA_LS ... SMCA_LS_V2] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [SMCA_RESERVED] = "reserved", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", + [SMCA_CS ... SMCA_CS_V2] = "coherent_slave", + [SMCA_PIE] = "pie", /* UMC v2 is separate because both of them can exist in a single system. */ - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, - [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, - [SMCA_NBIF] = { "nbif", "NBIF Unit" }, - [SMCA_SHUB] = { "shub", "System Hub Unit" }, - [SMCA_SATA] = { "sata", "SATA Unit" }, - [SMCA_USB] = { "usb", "USB Unit" }, - [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, - [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, - [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, - [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, + [SMCA_UMC] = "umc", + [SMCA_UMC_V2] = "umc_v2", + [SMCA_PB] = "param_block", + [SMCA_PSP ... SMCA_PSP_V2] = "psp", + [SMCA_SMU ... SMCA_SMU_V2] = "smu", + [SMCA_MP5] = "mp5", + [SMCA_MPDMA] = "mpdma", + [SMCA_NBIO] = "nbio", + [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie", + [SMCA_XGMI_PCS] = "xgmi_pcs", + [SMCA_NBIF] = "nbif", + [SMCA_SHUB] = "shub", + [SMCA_SATA] = "sata", + [SMCA_USB] = "usb", + [SMCA_GMI_PCS] = "gmi_pcs", + [SMCA_XGMI_PHY] = "xgmi_phy", + [SMCA_WAFL_PHY] = "wafl_phy", + [SMCA_GMI_PHY] = "gmi_phy", }; static const char *smca_get_name(enum smca_bank_types t) @@ -130,18 +125,9 @@ static const char *smca_get_name(enum smca_bank_types t) if (t >= N_SMCA_BANK_TYPES) return NULL; - return smca_names[t].name; + return smca_names[t]; } -const char *smca_get_long_name(enum smca_bank_types t) -{ - if (t >= N_SMCA_BANK_TYPES) - return NULL; - - return smca_names[t].long_name; -} -EXPORT_SYMBOL_GPL(smca_get_long_name); - enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { struct smca_bank *b; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 9215c06783df..28363eb42e3e 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1163,11 +1163,51 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static const char * const smca_long_names[] = { + [SMCA_LS ... SMCA_LS_V2] = "Load Store Unit", + [SMCA_IF] = "Instruction Fetch Unit", + [SMCA_L2_CACHE] = "L2 Cache", + [SMCA_DE] = "Decode Unit", + [SMCA_RESERVED] = "Reserved", + [SMCA_EX] = "Execution Unit", + [SMCA_FP] = "Floating Point Unit", + [SMCA_L3_CACHE] = "L3 Cache", + [SMCA_CS ... SMCA_CS_V2] = "Coherent Slave", + [SMCA_PIE] = "Power, Interrupts, etc.", + + /* UMC v2 is separate because both of them can exist in a single system. */ + [SMCA_UMC] = "Unified Memory Controller", + [SMCA_UMC_V2] = "Unified Memory Controller v2", + [SMCA_PB] = "Parameter Block", + [SMCA_PSP ... SMCA_PSP_V2] = "Platform Security Processor", + [SMCA_SMU ... SMCA_SMU_V2] = "System Management Unit", + [SMCA_MP5] = "Microprocessor 5 Unit", + [SMCA_MPDMA] = "MPDMA Unit", + [SMCA_NBIO] = "Northbridge IO Unit", + [SMCA_PCIE ... SMCA_PCIE_V2] = "PCI Express Unit", + [SMCA_XGMI_PCS] = "Ext Global Memory Interconnect PCS Unit", + [SMCA_NBIF] = "NBIF Unit", + [SMCA_SHUB] = "System Hub Unit", + [SMCA_SATA] = "SATA Unit", + [SMCA_USB] = "USB Unit", + [SMCA_GMI_PCS] = "Global Memory Interconnect PCS Unit", + [SMCA_XGMI_PHY] = "Ext Global Memory Interconnect PHY Unit", + [SMCA_WAFL_PHY] = "WAFL PHY Unit", + [SMCA_GMI_PHY] = "Global Memory Interconnect PHY Unit", +}; + +static const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_long_names[t]; +} + /* Decode errors according to Scalable MCA specification */ static void decode_smca_error(struct mce *m) { enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); - const char *ip_name; u8 xec = XEC(m->status, xec_mask); if (bank_type >= N_SMCA_BANK_TYPES) @@ -1178,9 +1218,7 @@ static void decode_smca_error(struct mce *m) return; } - ip_name = smca_get_long_name(bank_type); - - pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); + pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec); /* Only print the decode of valid error codes */ if (xec < smca_mce_descs[bank_type].num_descs) From e2768b798a197318736f00c506633cb78ff77012 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:26 +0000 Subject: [PATCH 073/310] arm64/mm: Modify range-based tlbi to decrement scale In preparation for adding support for LPA2 to the tlb invalidation routines, modify the algorithm used by range-based tlbi to start at the highest 'scale' and decrement instead of starting at the lowest 'scale' and incrementing. This new approach makes it possible to maintain 64K alignment as we work through the range, until the last op (at scale=0). This is required when LPA2 is enabled. (This part will be added in a subsequent commit). This change is separated into its own patch because it will also impact non-LPA2 systems, and I want to make it easy to bisect in case it leads to performance regression (see below for benchmarks that suggest this should not be a problem). The original commit (d1d3aa98 "arm64: tlb: Use the TLBI RANGE feature in arm64") stated this as the reason for _incrementing_ scale: However, in most scenarios, the pages = 1 when flush_tlb_range() is called. Start from scale = 3 or other proper value (such as scale =ilog2(pages)), will incur extra overhead. So increase 'scale' from 0 to maximum. But pages=1 is already special cased by the non-range invalidation path, which will take care of it the first time through the loop (both in the original commit and in my change), so I don't think switching to decrement scale should have any extra performance impact after all. Indeed benchmarking kernel compilation, a TLBI-heavy workload, suggests that this new approach actually _improves_ performance slightly (using a virtual machine on Apple M2): Table shows time to execute kernel compilation workload with 8 jobs, relative to baseline without this patch (more negative number is bigger speedup). Repeated 9 times across 3 system reboots: | counter | mean | stdev | |:----------|-----------:|----------:| | real-time | -0.6% | 0.0% | | kern-time | -1.6% | 0.5% | | user-time | -0.4% | 0.1% | Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-2-ryan.roberts@arm.com --- arch/arm64/include/asm/tlbflush.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bb2c2833a987..36acdb3d16a5 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -350,14 +350,14 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * entries one by one at the granularity of 'stride'. If the TLB * range ops are supported, then: * - * 1. If 'pages' is odd, flush the first page through non-range - * operations; + * 1. The minimum range granularity is decided by 'scale', so multiple range + * TLBI operations may be required. Start from scale = 3, flush the largest + * possible number of pages ((num+1)*2^(5*scale+1)) that fit into the + * requested range, then decrement scale and continue until one or zero pages + * are left. * - * 2. For remaining pages: the minimum range granularity is decided - * by 'scale', so multiple range TLBI operations may be required. - * Start from scale = 0, flush the corresponding number of pages - * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it - * until no pages left. + * 2. If there is 1 page remaining, flush it through non-range operations. Range + * operations can only span an even number of pages. * * Note that certain ranges can be represented by either num = 31 and * scale or num = 0 and scale + 1. The loop below favours the latter @@ -367,12 +367,12 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) asid, tlb_level, tlbi_user) \ do { \ int num = 0; \ - int scale = 0; \ + int scale = 3; \ unsigned long addr; \ \ while (pages > 0) { \ if (!system_supports_tlb_range() || \ - pages % 2 == 1) { \ + pages == 1) { \ addr = __TLBI_VADDR(start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ @@ -392,7 +392,7 @@ do { \ start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \ pages -= __TLBI_RANGE_PAGES(num, scale); \ } \ - scale++; \ + scale--; \ } \ } while (0) From 936a4ec28141fa9369b6af4d6401f0be9f7e304c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:27 +0000 Subject: [PATCH 074/310] arm64/mm: Add lpa2_is_enabled() kvm_lpa2_is_enabled() stubs Add stub functions which is initially always return false. These provide the hooks that we need to update the range-based TLBI routines, whose operands are encoded differently depending on whether lpa2 is enabled or not. The kernel and kvm will enable the use of lpa2 asynchronously in future, and part of that enablement will involve fleshing out their respective hook to advertise when it is using lpa2. Since the kernel's decision to use lpa2 relies on more than just whether the HW supports the feature, it can't just use the same static key as kvm. This is another reason to use separate functions. lpa2_is_enabled() is already implemented as part of Ard's kernel lpa2 series. Since kvm will make its decision solely based on HW support, kvm_lpa2_is_enabled() will be defined as system_supports_lpa2() once kvm starts using lpa2. Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-3-ryan.roberts@arm.com --- arch/arm64/include/asm/kvm_pgtable.h | 2 ++ arch/arm64/include/asm/pgtable-prot.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index d3e354bb8351..10068500d601 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,6 +25,8 @@ #define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U #endif +#define kvm_lpa2_is_enabled() false + static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index e9624f6326dd..483dbfa39c4c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -71,6 +71,8 @@ extern bool arm64_use_ng_mappings; #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) +#define lpa2_is_enabled() false + /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. From c910f2b65518538b5072cb51760c8ef749e455d0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:28 +0000 Subject: [PATCH 075/310] arm64/mm: Update tlb invalidation routines for FEAT_LPA2 FEAT_LPA2 impacts tlb invalidation in 2 ways; Firstly, the TTL field in the non-range tlbi instructions can now validly take a 0 value as a level hint for the 4KB granule (this is due to the extra level of translation) - previously TTL=0b0100 meant no hint and was treated as 0b0000. Secondly, The BADDR field of the range-based tlbi instructions is specified in 64KB units when LPA2 is in use (TCR.DS=1), whereas it is in page units otherwise. Changes are required for tlbi to continue to operate correctly when LPA2 is in use. Solve the first problem by always adding the level hint if the level is between [0, 3] (previously anything other than 0 was hinted, which breaks in the new level -1 case from kvm). When running on non-LPA2 HW, 0 is still safe to hint as the HW will fall back to non-hinted. While we are at it, we replace the notion of 0 being the non-hinted sentinel with a macro, TLBI_TTL_UNKNOWN. This means callers won't need updating if/when translation depth increases in future. The second issue is more complex: When LPA2 is in use, use the non-range tlbi instructions to forward align to a 64KB boundary first, then use range-based tlbi from there on, until we have either invalidated all pages or we have a single page remaining. If the latter, that is done with non-range tlbi. We determine whether LPA2 is in use based on lpa2_is_enabled() (for kernel calls) or kvm_lpa2_is_enabled() (for kvm calls). Reviewed-by: Catalin Marinas Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-4-ryan.roberts@arm.com --- arch/arm64/include/asm/tlb.h | 15 ++++-- arch/arm64/include/asm/tlbflush.h | 90 ++++++++++++++++++++----------- 2 files changed, 68 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 846c563689a8..0150deb332af 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -22,15 +22,15 @@ static void tlb_flush(struct mmu_gather *tlb); #include /* - * get the tlbi levels in arm64. Default value is 0 if more than one - * of cleared_* is set or neither is set. - * Arm64 doesn't support p4ds now. + * get the tlbi levels in arm64. Default value is TLBI_TTL_UNKNOWN if more than + * one of cleared_* is set or neither is set - this elides the level hinting to + * the hardware. */ static inline int tlb_get_level(struct mmu_gather *tlb) { /* The TTL field is only valid for the leaf entry. */ if (tlb->freed_tables) - return 0; + return TLBI_TTL_UNKNOWN; if (tlb->cleared_ptes && !(tlb->cleared_pmds || tlb->cleared_puds || @@ -47,7 +47,12 @@ static inline int tlb_get_level(struct mmu_gather *tlb) tlb->cleared_p4ds)) return 1; - return 0; + if (tlb->cleared_p4ds && !(tlb->cleared_ptes || + tlb->cleared_pmds || + tlb->cleared_puds)) + return 0; + + return TLBI_TTL_UNKNOWN; } static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 36acdb3d16a5..1deb5d789c2e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -94,19 +94,22 @@ static inline unsigned long get_trans_granule(void) * When ARMv8.4-TTL exists, TLBI operations take an additional hint for * the level at which the invalidation must take place. If the level is * wrong, no invalidation may take place. In the case where the level - * cannot be easily determined, a 0 value for the level parameter will - * perform a non-hinted invalidation. + * cannot be easily determined, the value TLBI_TTL_UNKNOWN will perform + * a non-hinted invalidation. Any provided level outside the hint range + * will also cause fall-back to non-hinted invalidation. * * For Stage-2 invalidation, use the level values provided to that effect * in asm/stage2_pgtable.h. */ #define TLBI_TTL_MASK GENMASK_ULL(47, 44) +#define TLBI_TTL_UNKNOWN INT_MAX + #define __tlbi_level(op, addr, level) do { \ u64 arg = addr; \ \ if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \ - level) { \ + level >= 0 && level <= 3) { \ u64 ttl = level & 3; \ ttl |= get_trans_granule() << 2; \ arg &= ~TLBI_TTL_MASK; \ @@ -122,28 +125,34 @@ static inline unsigned long get_trans_granule(void) } while (0) /* - * This macro creates a properly formatted VA operand for the TLB RANGE. - * The value bit assignments are: + * This macro creates a properly formatted VA operand for the TLB RANGE. The + * value bit assignments are: * * +----------+------+-------+-------+-------+----------------------+ * | ASID | TG | SCALE | NUM | TTL | BADDR | * +-----------------+-------+-------+-------+----------------------+ * |63 48|47 46|45 44|43 39|38 37|36 0| * - * The address range is determined by below formula: - * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) + * The address range is determined by below formula: [BADDR, BADDR + (NUM + 1) * + * 2^(5*SCALE + 1) * PAGESIZE) + * + * Note that the first argument, baddr, is pre-shifted; If LPA2 is in use, BADDR + * holds addr[52:16]. Else BADDR holds page number. See for example ARM DDI + * 0487J.a section C5.5.60 "TLBI VAE1IS, TLBI VAE1ISNXS, TLB Invalidate by VA, + * EL1, Inner Shareable". * */ -#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ - ({ \ - unsigned long __ta = (addr) >> PAGE_SHIFT; \ - __ta &= GENMASK_ULL(36, 0); \ - __ta |= (unsigned long)(ttl) << 37; \ - __ta |= (unsigned long)(num) << 39; \ - __ta |= (unsigned long)(scale) << 44; \ - __ta |= get_trans_granule() << 46; \ - __ta |= (unsigned long)(asid) << 48; \ - __ta; \ +#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = (baddr); \ + unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \ + __ta &= GENMASK_ULL(36, 0); \ + __ta |= __ttl << 37; \ + __ta |= (unsigned long)(num) << 39; \ + __ta |= (unsigned long)(scale) << 44; \ + __ta |= get_trans_granule() << 46; \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ }) /* These macros are used by the TLBI RANGE feature. */ @@ -216,12 +225,16 @@ static inline unsigned long get_trans_granule(void) * CPUs, ensuring that any walk-cache entries associated with the * translation are also invalidated. * - * __flush_tlb_range(vma, start, end, stride, last_level) + * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level) * Invalidate the virtual-address range '[start, end)' on all * CPUs for the user address space corresponding to 'vma->mm'. * The invalidation operations are issued at a granularity * determined by 'stride' and only affect any walk-cache entries - * if 'last_level' is equal to false. + * if 'last_level' is equal to false. tlb_level is the level at + * which the invalidation must take place. If the level is wrong, + * no invalidation may take place. In the case where the level + * cannot be easily determined, the value TLBI_TTL_UNKNOWN will + * perform a non-hinted invalidation. * * * Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented @@ -345,34 +358,44 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * @tlb_level: Translation Table level hint, if known * @tlbi_user: If 'true', call an additional __tlbi_user() * (typically for user ASIDs). 'flase' for IPA instructions + * @lpa2: If 'true', the lpa2 scheme is used as set out below * * When the CPU does not support TLB range operations, flush the TLB * entries one by one at the granularity of 'stride'. If the TLB * range ops are supported, then: * - * 1. The minimum range granularity is decided by 'scale', so multiple range + * 1. If FEAT_LPA2 is in use, the start address of a range operation must be + * 64KB aligned, so flush pages one by one until the alignment is reached + * using the non-range operations. This step is skipped if LPA2 is not in + * use. + * + * 2. The minimum range granularity is decided by 'scale', so multiple range * TLBI operations may be required. Start from scale = 3, flush the largest * possible number of pages ((num+1)*2^(5*scale+1)) that fit into the * requested range, then decrement scale and continue until one or zero pages - * are left. + * are left. We must start from highest scale to ensure 64KB start alignment + * is maintained in the LPA2 case. * - * 2. If there is 1 page remaining, flush it through non-range operations. Range - * operations can only span an even number of pages. + * 3. If there is 1 page remaining, flush it through non-range operations. Range + * operations can only span an even number of pages. We save this for last to + * ensure 64KB start alignment is maintained for the LPA2 case. * * Note that certain ranges can be represented by either num = 31 and * scale or num = 0 and scale + 1. The loop below favours the latter * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. */ #define __flush_tlb_range_op(op, start, pages, stride, \ - asid, tlb_level, tlbi_user) \ + asid, tlb_level, tlbi_user, lpa2) \ do { \ int num = 0; \ int scale = 3; \ + int shift = lpa2 ? 16 : PAGE_SHIFT; \ unsigned long addr; \ \ while (pages > 0) { \ if (!system_supports_tlb_range() || \ - pages == 1) { \ + pages == 1 || \ + (lpa2 && start != ALIGN(start, SZ_64K))) { \ addr = __TLBI_VADDR(start, asid); \ __tlbi_level(op, addr, tlb_level); \ if (tlbi_user) \ @@ -384,8 +407,8 @@ do { \ \ num = __TLBI_RANGE_NUM(pages, scale); \ if (num >= 0) { \ - addr = __TLBI_VADDR_RANGE(start, asid, scale, \ - num, tlb_level); \ + addr = __TLBI_VADDR_RANGE(start >> shift, asid, \ + scale, num, tlb_level); \ __tlbi(r##op, addr); \ if (tlbi_user) \ __tlbi_user(r##op, addr); \ @@ -397,7 +420,7 @@ do { \ } while (0) #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ - __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false) + __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -427,9 +450,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, asid = ASID(vma->vm_mm); if (last_level) - __flush_tlb_range_op(vale1is, start, pages, stride, asid, tlb_level, true); + __flush_tlb_range_op(vale1is, start, pages, stride, asid, + tlb_level, true, lpa2_is_enabled()); else - __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true); + __flush_tlb_range_op(vae1is, start, pages, stride, asid, + tlb_level, true, lpa2_is_enabled()); dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); @@ -441,9 +466,10 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, /* * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. - * Set the tlb_level to 0 because we can not get enough information here. + * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough + * information here. */ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) From e477c8c483913de92c9cc00b34459dc4d695529b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 27 Nov 2023 11:17:29 +0000 Subject: [PATCH 076/310] arm64/mm: Add FEAT_LPA2 specific ID_AA64MMFR0.TGRAN[2] PAGE_SIZE support is tested against possible minimum and maximum values for its respective ID_AA64MMFR0.TGRAN field, depending on whether it is signed or unsigned. But then FEAT_LPA2 implementation needs to be validated for 4K and 16K page sizes via feature specific ID_AA64MMFR0.TGRAN values. Hence it adds FEAT_LPA2 specific ID_AA64MMFR0.TGRAN[2] values per ARM ARM (0487G.A). Acked-by: Catalin Marinas Signed-off-by: Anshuman Khandual Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-5-ryan.roberts@arm.com --- arch/arm64/include/asm/sysreg.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 5e65f51c10d2..48181cf6cc40 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -871,10 +871,12 @@ /* id_aa64mmfr0 */ #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN4_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN 0x1 +#define ID_AA64MMFR0_EL1_TGRAN16_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX 0xf #define ARM64_MIN_PARANGE_BITS 32 @@ -882,6 +884,7 @@ #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2 0x3 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX 0x7 #ifdef CONFIG_ARM64_PA_BITS_52 @@ -892,11 +895,13 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN4_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN16_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT From b1366d21daaebb8e474e4169c5e557fbb37bfdc0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 27 Nov 2023 11:17:30 +0000 Subject: [PATCH 077/310] arm64: Add ARM64_HAS_LPA2 CPU capability Expose FEAT_LPA2 as a capability so that we can take advantage of alternatives patching in the hypervisor. Although FEAT_LPA2 presence is advertised separately for stage1 and stage2, the expectation is that in practice both stages will either support or not support it. Therefore, we combine both into a single capability, allowing us to simplify the implementation. KVM requires support in both stages in order to use LPA2 since the same library is used for hyp stage 1 and guest stage 2 pgtables. Reviewed-by: Oliver Upton Signed-off-by: Ryan Roberts Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20231127111737.1897081-6-ryan.roberts@arm.com --- arch/arm64/include/asm/cpufeature.h | 5 ++++ arch/arm64/kernel/cpufeature.c | 39 +++++++++++++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 3 files changed, 45 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f6d416fe49b0..acf109581ac0 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -819,6 +819,11 @@ static inline bool system_supports_tlb_range(void) return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE); } +static inline bool system_supports_lpa2(void) +{ + return cpus_have_final_cap(ARM64_HAS_LPA2); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646591c67e7a..7900ba7e157e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1768,6 +1768,39 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return !meltdown_safe; } +#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2) +static bool has_lpa2_at_stage1(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2; +} + +static bool has_lpa2_at_stage2(u64 mmfr0) +{ + unsigned int tgran; + + tgran = cpuid_feature_extract_unsigned_field(mmfr0, + ID_AA64MMFR0_EL1_TGRAN_2_SHIFT); + return tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2; +} + +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + u64 mmfr0; + + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + return has_lpa2_at_stage1(mmfr0) && has_lpa2_at_stage2(mmfr0); +} +#else +static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope) +{ + return false; +} +#endif + #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) @@ -2731,6 +2764,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP) }, + { + .desc = "52-bit Virtual Addressing for KVM (LPA2)", + .capability = ARM64_HAS_LPA2, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_lpa2, + }, {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b98c38288a9d..919eceb0b3da 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -37,6 +37,7 @@ HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC HAS_HCX HAS_LDAPR +HAS_LPA2 HAS_LSE_ATOMICS HAS_MOPS HAS_NESTED_VIRT From 9e08ac1b5e3bb7837aa23b67a97f08b80384e9d2 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 13 Oct 2023 14:03:42 -0500 Subject: [PATCH 078/310] EDAC/armada_xp: Explicitly include correct DT includes The DT of_device.h and of_platform.h date back to the separate of_platform_bus_type before it was merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. They also include platform_device.h and of.h. As a result, there's a pretty much random mix of those include files used throughout the tree. In order to detangle these headers and replace the implicit includes with struct declarations, users need to explicitly include the correct includes. Signed-off-by: Rob Herring Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Jan Luebbe Link: https://lore.kernel.org/r/20231013190342.246973-1-robh@kernel.org --- drivers/edac/armada_xp_edac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/edac/armada_xp_edac.c b/drivers/edac/armada_xp_edac.c index 5bcd34f23baf..25517c99b3ea 100644 --- a/drivers/edac/armada_xp_edac.c +++ b/drivers/edac/armada_xp_edac.c @@ -5,7 +5,9 @@ #include #include -#include +#include +#include +#include #include #include From 9be4feb768b86c25da336a6c0f3e3caefd16f1e4 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 27 Nov 2023 18:40:09 -0600 Subject: [PATCH 079/310] powerpc/rtas_pci: rename and properly expose config access APIs The rtas_read_config() and rtas_write_config() functions in kernel/rtas_pci.c have external linkage and two users in arch/powerpc: the rtas_pci code itself and the pseries platform's "enhanced error handling" (EEH) support code. The prototypes for these functions in asm/ppc-pci.h have until now been guarded by CONFIG_EEH since the only external caller is the pseries EEH code. However, this presumably has always generated warnings when built with !CONFIG_EEH and -Wmissing-prototypes: arch/powerpc/kernel/rtas_pci.c:46:5: error: no previous prototype for function 'rtas_read_config' [-Werror,-Wmissing-prototypes] 46 | int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) arch/powerpc/kernel/rtas_pci.c:98:5: error: no previous prototype for function 'rtas_write_config' [-Werror,-Wmissing-prototypes] 98 | int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) The introduction of commit c6345dfa6e3e ("Makefile.extrawarn: turn on missing-prototypes globally") forces the issue. The efika and chrp platform code have (static) functions with the same names but different signatures. We may as well eliminate the potential for conflicts and confusion by renaming the globally visible versions as their prototypes get moved out of the CONFIG_EEH-guarded region; their current names are too generic anyway. Since they operate on objects of the type 'struct pci_dn *', give them the slightly more verbose prefix "rtas_pci_dn_" and fix up all the call sites. Fixes: c6345dfa6e3e ("Makefile.extrawarn: turn on missing-prototypes globally") Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/linuxppc-dev/CA+G9fYt0LLXtjSz+Hkf3Fhm-kf0ZQanrhUS+zVZGa3O+Wt2+vg@mail.gmail.com/ Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231127-rtas-pci-rw-config-v1-1-385d29ace3df@linux.ibm.com --- arch/powerpc/include/asm/ppc-pci.h | 5 +++-- arch/powerpc/kernel/rtas_pci.c | 8 ++++---- arch/powerpc/platforms/pseries/eeh_pseries.c | 18 +++++++++--------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index d9fcff575027..ce2b1b5eebdd 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -35,6 +35,9 @@ extern void init_pci_config_tokens (void); extern unsigned long get_phb_buid (struct device_node *); extern int rtas_setup_phb(struct pci_controller *phb); +int rtas_pci_dn_read_config(struct pci_dn *pdn, int where, int size, u32 *val); +int rtas_pci_dn_write_config(struct pci_dn *pdn, int where, int size, u32 val); + #ifdef CONFIG_EEH void eeh_addr_cache_insert_dev(struct pci_dev *dev); @@ -44,8 +47,6 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); -int rtas_write_config(struct pci_dn *, int where, int size, u32 val); -int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); void eeh_pe_mark_isolated(struct eeh_pe *pe); void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index e1fdc7473b72..fccf96e897f6 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -43,7 +43,7 @@ static inline int config_access_valid(struct pci_dn *dn, int where) return 0; } -int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +int rtas_pci_dn_read_config(struct pci_dn *pdn, int where, int size, u32 *val) { int returnval = -1; unsigned long buid, addr; @@ -87,7 +87,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, pdn = pci_get_pdn_by_devfn(bus, devfn); /* Validity of pdn is checked in here */ - ret = rtas_read_config(pdn, where, size, val); + ret = rtas_pci_dn_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) return PCIBIOS_DEVICE_NOT_FOUND; @@ -95,7 +95,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, return ret; } -int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) +int rtas_pci_dn_write_config(struct pci_dn *pdn, int where, int size, u32 val) { unsigned long buid, addr; int ret; @@ -134,7 +134,7 @@ static int rtas_pci_write_config(struct pci_bus *bus, pdn = pci_get_pdn_by_devfn(bus, devfn); /* Validity of pdn is checked in here. */ - return rtas_write_config(pdn, where, size, val); + return rtas_pci_dn_write_config(pdn, where, size, val); } static struct pci_ops rtas_pci_ops = { diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index def184da51cf..b1ae0c0d1187 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -252,7 +252,7 @@ static int pseries_eeh_cap_start(struct pci_dn *pdn) if (!pdn) return 0; - rtas_read_config(pdn, PCI_STATUS, 2, &status); + rtas_pci_dn_read_config(pdn, PCI_STATUS, 2, &status); if (!(status & PCI_STATUS_CAP_LIST)) return 0; @@ -270,11 +270,11 @@ static int pseries_eeh_find_cap(struct pci_dn *pdn, int cap) return 0; while (cnt--) { - rtas_read_config(pdn, pos, 1, &pos); + rtas_pci_dn_read_config(pdn, pos, 1, &pos); if (pos < 0x40) break; pos &= ~3; - rtas_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); + rtas_pci_dn_read_config(pdn, pos + PCI_CAP_LIST_ID, 1, &id); if (id == 0xff) break; if (id == cap) @@ -294,7 +294,7 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) if (!edev || !edev->pcie_cap) return 0; - if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (rtas_pci_dn_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) return 0; else if (!header) return 0; @@ -307,7 +307,7 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) if (pos < 256) break; - if (rtas_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) + if (rtas_pci_dn_read_config(pdn, pos, 4, &header) != PCIBIOS_SUCCESSFUL) break; } @@ -412,8 +412,8 @@ static void pseries_eeh_init_edev(struct pci_dn *pdn) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { - rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, - 2, &pcie_flags); + rtas_pci_dn_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS, + 2, &pcie_flags); pcie_flags = (pcie_flags & PCI_EXP_FLAGS_TYPE) >> 4; if (pcie_flags == PCI_EXP_TYPE_ROOT_PORT) edev->mode |= EEH_DEV_ROOT_PORT; @@ -676,7 +676,7 @@ static int pseries_eeh_read_config(struct eeh_dev *edev, int where, int size, u3 { struct pci_dn *pdn = eeh_dev_to_pdn(edev); - return rtas_read_config(pdn, where, size, val); + return rtas_pci_dn_read_config(pdn, where, size, val); } /** @@ -692,7 +692,7 @@ static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u { struct pci_dn *pdn = eeh_dev_to_pdn(edev); - return rtas_write_config(pdn, where, size, val); + return rtas_pci_dn_write_config(pdn, where, size, val); } #ifdef CONFIG_PCI_IOV From c64545594daf748422fa083389b062d0a16fb477 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 28 Nov 2023 10:00:16 +0100 Subject: [PATCH 080/310] x86/Kconfig: Remove obsolete config X86_32_SMP Commit 0f08c3b22996 ("x86/smp: Reduce code duplication") removed the only use of CONFIG_X86_32_SMP. Remove the now obsolete config X86_32_SMP too. Signed-off-by: Lukas Bulwahn Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231128090016.29676-1-lukas.bulwahn@gmail.com --- arch/x86/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..5a6728dda670 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -384,10 +384,6 @@ config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI -config X86_32_SMP - def_bool y - depends on X86_32 && SMP - config X86_64_SMP def_bool y depends on X86_64 && SMP From 9f988030e85fafa2b03910d467302853ad29a300 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 2 Nov 2023 11:42:22 +0000 Subject: [PATCH 081/310] EDAC/mce_amd: Remove SMCA Extended Error code descriptions On AMD systems with Scalable MCA each machine check error of a SMCA bank type has an associated bit position in the bank's control (CTL) register. An error's bit position in the CTL register is used during error decoding for offsetting into the corresponding bank's error description structure. As new errors are being added in newer AMD systems for existing SMCA bank types, the underlying SMCA architecture guarantees that the bit positions of existing errors are not altered. However, on some AMD systems some of the existing bit definitions in the CTL register of SMCA bank type are reassigned without defining new HWID and McaType. Consequently, the errors whose bit definitions have been reassigned in the CTL register are being erroneously decoded. Remove SMCA Extended Error Code descriptions, this avoids decoding issues for incorrectly reassigned bits, and avoids the related maintenance burden in the kernel. But the bank type and Extended Error Code value for an error will continue to be printed as a convenience. The decoding of SMCA Extended Error Code description can be done by referring to AMD documentation or use external tools such as rasdaemon. Offline decoding can be done using below option in rasdaemon. For example: $ rasdaemon -p --status --ipid --smca Also, the user can pass particular family and model to decode the error string. $ rasdaemon -p --status --ipid --smca --family --model --bank Refer to the rasdaemon commit for details: https://github.com/mchehab/rasdaemon/commit/932118b04a04104dfac6b8536 Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20231102114225.2006878-2-muralimk@amd.com --- drivers/edac/mce_amd.c | 480 ----------------------------------------- 1 file changed, 480 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 28363eb42e3e..ec8b6c9fedfd 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -143,482 +143,6 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; -/* Scalable MCA error strings */ -static const char * const smca_ls_mce_desc[] = { - "Load queue parity error", - "Store queue parity error", - "Miss address buffer payload parity error", - "Level 1 TLB parity error", - "DC Tag error type 5", - "DC Tag error type 6", - "DC Tag error type 1", - "Internal error type 1", - "Internal error type 2", - "System Read Data Error Thread 0", - "System Read Data Error Thread 1", - "DC Tag error type 2", - "DC Data error type 1 and poison consumption", - "DC Data error type 2", - "DC Data error type 3", - "DC Tag error type 4", - "Level 2 TLB parity error", - "PDC parity error", - "DC Tag error type 3", - "DC Tag error type 5", - "L2 Fill Data error", -}; - -static const char * const smca_ls2_mce_desc[] = { - "An ECC error was detected on a data cache read by a probe or victimization", - "An ECC error or L2 poison was detected on a data cache read by a load", - "An ECC error was detected on a data cache read-modify-write by a store", - "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization", - "An ECC error or poison bit mismatch was detected on a tag read by a load", - "An ECC error or poison bit mismatch was detected on a tag read by a store", - "An ECC error was detected on an EMEM read by a load", - "An ECC error was detected on an EMEM read-modify-write by a store", - "A parity error was detected in an L1 TLB entry by any access", - "A parity error was detected in an L2 TLB entry by any access", - "A parity error was detected in a PWC entry by any access", - "A parity error was detected in an STQ entry by any access", - "A parity error was detected in an LDQ entry by any access", - "A parity error was detected in a MAB entry by any access", - "A parity error was detected in an SCB entry state field by any access", - "A parity error was detected in an SCB entry address field by any access", - "A parity error was detected in an SCB entry data field by any access", - "A parity error was detected in a WCB entry by any access", - "A poisoned line was detected in an SCB entry by any access", - "A SystemReadDataError error was reported on read data returned from L2 for a load", - "A SystemReadDataError error was reported on read data returned from L2 for an SCB store", - "A SystemReadDataError error was reported on read data returned from L2 for a WCB store", - "A hardware assertion error was reported", - "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access", -}; - -static const char * const smca_if_mce_desc[] = { - "Op Cache Microtag Probe Port Parity Error", - "IC Microtag or Full Tag Multi-hit Error", - "IC Full Tag Parity Error", - "IC Data Array Parity Error", - "Decoupling Queue PhysAddr Parity Error", - "L0 ITLB Parity Error", - "L1 ITLB Parity Error", - "L2 ITLB Parity Error", - "BPQ Thread 0 Snoop Parity Error", - "BPQ Thread 1 Snoop Parity Error", - "L1 BTB Multi-Match Error", - "L2 BTB Multi-Match Error", - "L2 Cache Response Poison Error", - "System Read Data Error", - "Hardware Assertion Error", - "L1-TLB Multi-Hit", - "L2-TLB Multi-Hit", - "BSR Parity Error", - "CT MCE", -}; - -static const char * const smca_l2_mce_desc[] = { - "L2M Tag Multiple-Way-Hit error", - "L2M Tag or State Array ECC Error", - "L2M Data Array ECC Error", - "Hardware Assert Error", -}; - -static const char * const smca_de_mce_desc[] = { - "Micro-op cache tag parity error", - "Micro-op cache data parity error", - "Instruction buffer parity error", - "Micro-op queue parity error", - "Instruction dispatch queue parity error", - "Fetch address FIFO parity error", - "Patch RAM data parity error", - "Patch RAM sequencer parity error", - "Micro-op buffer parity error", - "Hardware Assertion MCA Error", -}; - -static const char * const smca_ex_mce_desc[] = { - "Watchdog Timeout error", - "Physical register file parity error", - "Flag register file parity error", - "Immediate displacement register file parity error", - "Address generator payload parity error", - "EX payload parity error", - "Checkpoint queue parity error", - "Retire dispatch queue parity error", - "Retire status queue parity error", - "Scheduling queue parity error", - "Branch buffer queue parity error", - "Hardware Assertion error", - "Spec Map parity error", - "Retire Map parity error", -}; - -static const char * const smca_fp_mce_desc[] = { - "Physical register file (PRF) parity error", - "Freelist (FL) parity error", - "Schedule queue parity error", - "NSQ parity error", - "Retire queue (RQ) parity error", - "Status register file (SRF) parity error", - "Hardware assertion", -}; - -static const char * const smca_l3_mce_desc[] = { - "Shadow Tag Macro ECC Error", - "Shadow Tag Macro Multi-way-hit Error", - "L3M Tag ECC Error", - "L3M Tag Multi-way-hit Error", - "L3M Data ECC Error", - "SDP Parity Error or SystemReadDataError from XI", - "L3 Victim Queue Parity Error", - "L3 Hardware Assertion", -}; - -static const char * const smca_cs_mce_desc[] = { - "Illegal Request", - "Address Violation", - "Security Violation", - "Illegal Response", - "Unexpected Response", - "Request or Probe Parity Error", - "Read Response Parity Error", - "Atomic Request Parity Error", - "Probe Filter ECC Error", -}; - -static const char * const smca_cs2_mce_desc[] = { - "Illegal Request", - "Address Violation", - "Security Violation", - "Illegal Response", - "Unexpected Response", - "Request or Probe Parity Error", - "Read Response Parity Error", - "Atomic Request Parity Error", - "SDP read response had no match in the CS queue", - "Probe Filter Protocol Error", - "Probe Filter ECC Error", - "SDP read response had an unexpected RETRY error", - "Counter overflow error", - "Counter underflow error", -}; - -static const char * const smca_pie_mce_desc[] = { - "Hardware Assert", - "Register security violation", - "Link Error", - "Poison data consumption", - "A deferred error was detected in the DF" -}; - -static const char * const smca_umc_mce_desc[] = { - "DRAM ECC error", - "Data poison error", - "SDP parity error", - "Advanced peripheral bus error", - "Address/Command parity error", - "Write data CRC error", - "DCQ SRAM ECC error", - "AES SRAM ECC error", -}; - -static const char * const smca_umc2_mce_desc[] = { - "DRAM ECC error", - "Data poison error", - "SDP parity error", - "Reserved", - "Address/Command parity error", - "Write data parity error", - "DCQ SRAM ECC error", - "Reserved", - "Read data parity error", - "Rdb SRAM ECC error", - "RdRsp SRAM ECC error", - "LM32 MP errors", -}; - -static const char * const smca_pb_mce_desc[] = { - "An ECC error in the Parameter Block RAM array", -}; - -static const char * const smca_psp_mce_desc[] = { - "An ECC or parity error in a PSP RAM instance", -}; - -static const char * const smca_psp2_mce_desc[] = { - "High SRAM ECC or parity error", - "Low SRAM ECC or parity error", - "Instruction Cache Bank 0 ECC or parity error", - "Instruction Cache Bank 1 ECC or parity error", - "Instruction Tag Ram 0 parity error", - "Instruction Tag Ram 1 parity error", - "Data Cache Bank 0 ECC or parity error", - "Data Cache Bank 1 ECC or parity error", - "Data Cache Bank 2 ECC or parity error", - "Data Cache Bank 3 ECC or parity error", - "Data Tag Bank 0 parity error", - "Data Tag Bank 1 parity error", - "Data Tag Bank 2 parity error", - "Data Tag Bank 3 parity error", - "Dirty Data Ram parity error", - "TLB Bank 0 parity error", - "TLB Bank 1 parity error", - "System Hub Read Buffer ECC or parity error", -}; - -static const char * const smca_smu_mce_desc[] = { - "An ECC or parity error in an SMU RAM instance", -}; - -static const char * const smca_smu2_mce_desc[] = { - "High SRAM ECC or parity error", - "Low SRAM ECC or parity error", - "Data Cache Bank A ECC or parity error", - "Data Cache Bank B ECC or parity error", - "Data Tag Cache Bank A ECC or parity error", - "Data Tag Cache Bank B ECC or parity error", - "Instruction Cache Bank A ECC or parity error", - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", - "System Hub Read Buffer ECC or parity error", - "PHY RAM ECC error", -}; - -static const char * const smca_mp5_mce_desc[] = { - "High SRAM ECC or parity error", - "Low SRAM ECC or parity error", - "Data Cache Bank A ECC or parity error", - "Data Cache Bank B ECC or parity error", - "Data Tag Cache Bank A ECC or parity error", - "Data Tag Cache Bank B ECC or parity error", - "Instruction Cache Bank A ECC or parity error", - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", -}; - -static const char * const smca_mpdma_mce_desc[] = { - "Main SRAM [31:0] bank ECC or parity error", - "Main SRAM [63:32] bank ECC or parity error", - "Main SRAM [95:64] bank ECC or parity error", - "Main SRAM [127:96] bank ECC or parity error", - "Data Cache Bank A ECC or parity error", - "Data Cache Bank B ECC or parity error", - "Data Tag Cache Bank A ECC or parity error", - "Data Tag Cache Bank B ECC or parity error", - "Instruction Cache Bank A ECC or parity error", - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", - "Data Cache Bank A ECC or parity error", - "Data Cache Bank B ECC or parity error", - "Data Tag Cache Bank A ECC or parity error", - "Data Tag Cache Bank B ECC or parity error", - "Instruction Cache Bank A ECC or parity error", - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", - "Data Cache Bank A ECC or parity error", - "Data Cache Bank B ECC or parity error", - "Data Tag Cache Bank A ECC or parity error", - "Data Tag Cache Bank B ECC or parity error", - "Instruction Cache Bank A ECC or parity error", - "Instruction Cache Bank B ECC or parity error", - "Instruction Tag Cache Bank A ECC or parity error", - "Instruction Tag Cache Bank B ECC or parity error", - "System Hub Read Buffer ECC or parity error", - "MPDMA TVF DVSEC Memory ECC or parity error", - "MPDMA TVF MMIO Mailbox0 ECC or parity error", - "MPDMA TVF MMIO Mailbox1 ECC or parity error", - "MPDMA TVF Doorbell Memory ECC or parity error", - "MPDMA TVF SDP Slave Memory 0 ECC or parity error", - "MPDMA TVF SDP Slave Memory 1 ECC or parity error", - "MPDMA TVF SDP Slave Memory 2 ECC or parity error", - "MPDMA TVF SDP Master Memory 0 ECC or parity error", - "MPDMA TVF SDP Master Memory 1 ECC or parity error", - "MPDMA TVF SDP Master Memory 2 ECC or parity error", - "MPDMA TVF SDP Master Memory 3 ECC or parity error", - "MPDMA TVF SDP Master Memory 4 ECC or parity error", - "MPDMA TVF SDP Master Memory 5 ECC or parity error", - "MPDMA TVF SDP Master Memory 6 ECC or parity error", - "MPDMA PTE Command FIFO ECC or parity error", - "MPDMA PTE Hub Data FIFO ECC or parity error", - "MPDMA PTE Internal Data FIFO ECC or parity error", - "MPDMA PTE Command Memory DMA ECC or parity error", - "MPDMA PTE Command Memory Internal ECC or parity error", - "MPDMA PTE DMA Completion FIFO ECC or parity error", - "MPDMA PTE Tablewalk Completion FIFO ECC or parity error", - "MPDMA PTE Descriptor Completion FIFO ECC or parity error", - "MPDMA PTE ReadOnly Completion FIFO ECC or parity error", - "MPDMA PTE DirectWrite Completion FIFO ECC or parity error", - "SDP Watchdog Timer expired", -}; - -static const char * const smca_nbio_mce_desc[] = { - "ECC or Parity error", - "PCIE error", - "SDP ErrEvent error", - "SDP Egress Poison Error", - "IOHC Internal Poison Error", -}; - -static const char * const smca_pcie_mce_desc[] = { - "CCIX PER Message logging", - "CCIX Read Response with Status: Non-Data Error", - "CCIX Write Response with Status: Non-Data Error", - "CCIX Read Response with Status: Data Error", - "CCIX Non-okay write response with data error", -}; - -static const char * const smca_pcie2_mce_desc[] = { - "SDP Parity Error logging", -}; - -static const char * const smca_xgmipcs_mce_desc[] = { - "Data Loss Error", - "Training Error", - "Flow Control Acknowledge Error", - "Rx Fifo Underflow Error", - "Rx Fifo Overflow Error", - "CRC Error", - "BER Exceeded Error", - "Tx Vcid Data Error", - "Replay Buffer Parity Error", - "Data Parity Error", - "Replay Fifo Overflow Error", - "Replay Fifo Underflow Error", - "Elastic Fifo Overflow Error", - "Deskew Error", - "Flow Control CRC Error", - "Data Startup Limit Error", - "FC Init Timeout Error", - "Recovery Timeout Error", - "Ready Serial Timeout Error", - "Ready Serial Attempt Error", - "Recovery Attempt Error", - "Recovery Relock Attempt Error", - "Replay Attempt Error", - "Sync Header Error", - "Tx Replay Timeout Error", - "Rx Replay Timeout Error", - "LinkSub Tx Timeout Error", - "LinkSub Rx Timeout Error", - "Rx CMD Packet Error", -}; - -static const char * const smca_xgmiphy_mce_desc[] = { - "RAM ECC Error", - "ARC instruction buffer parity error", - "ARC data buffer parity error", - "PHY APB error", -}; - -static const char * const smca_nbif_mce_desc[] = { - "Timeout error from GMI", - "SRAM ECC error", - "NTB Error Event", - "SDP Parity error", -}; - -static const char * const smca_sata_mce_desc[] = { - "Parity error for port 0", - "Parity error for port 1", - "Parity error for port 2", - "Parity error for port 3", - "Parity error for port 4", - "Parity error for port 5", - "Parity error for port 6", - "Parity error for port 7", -}; - -static const char * const smca_usb_mce_desc[] = { - "Parity error or ECC error for S0 RAM0", - "Parity error or ECC error for S0 RAM1", - "Parity error or ECC error for S0 RAM2", - "Parity error for PHY RAM0", - "Parity error for PHY RAM1", - "AXI Slave Response error", -}; - -static const char * const smca_gmipcs_mce_desc[] = { - "Data Loss Error", - "Training Error", - "Replay Parity Error", - "Rx Fifo Underflow Error", - "Rx Fifo Overflow Error", - "CRC Error", - "BER Exceeded Error", - "Tx Fifo Underflow Error", - "Replay Buffer Parity Error", - "Tx Overflow Error", - "Replay Fifo Overflow Error", - "Replay Fifo Underflow Error", - "Elastic Fifo Overflow Error", - "Deskew Error", - "Offline Error", - "Data Startup Limit Error", - "FC Init Timeout Error", - "Recovery Timeout Error", - "Ready Serial Timeout Error", - "Ready Serial Attempt Error", - "Recovery Attempt Error", - "Recovery Relock Attempt Error", - "Deskew Abort Error", - "Rx Buffer Error", - "Rx LFDS Fifo Overflow Error", - "Rx LFDS Fifo Underflow Error", - "LinkSub Tx Timeout Error", - "LinkSub Rx Timeout Error", - "Rx CMD Packet Error", - "LFDS Training Timeout Error", - "LFDS FC Init Timeout Error", - "Data Loss Error", -}; - -struct smca_mce_desc { - const char * const *descs; - unsigned int num_descs; -}; - -static struct smca_mce_desc smca_mce_descs[] = { - [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) }, - [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) }, - [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) }, - [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) }, - [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) }, - [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) }, - [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, - [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, - [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, - [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, - [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, - [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, - [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, - [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, - [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, - [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, - [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, - [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, - [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, - [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) }, - [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, - [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, - [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, - [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, - /* NBIF and SHUB have the same error descriptions, for now. */ - [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, - [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) }, - [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) }, - [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) }, - [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) }, - /* All the PHY bank types have the same error descriptions, for now. */ - [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, - [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, - [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, -}; - static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -1220,10 +744,6 @@ static void decode_smca_error(struct mce *m) pr_emerg(HW_ERR "%s Ext. Error Code: %d", smca_get_long_name(bank_type), xec); - /* Only print the decode of valid error codes */ - if (xec < smca_mce_descs[bank_type].num_descs) - pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); - if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0 && decode_dram_ecc) decode_dram_ecc(topology_die_id(m->extcpu), m); From a2f99fbae4513ce4871e71ba88cd9d75fe5019e6 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Tue, 28 Nov 2023 19:47:03 +0530 Subject: [PATCH 082/310] EDAC/{sb,i7core}_edac: Do not use a plain integer for a NULL pointer Sparse warns about the use of the integer constant 0 as a NULL pointer with the -Wnon-pointer-null switch. Even though the C standard requires that 0 == NULL and type conversion rules turn an integer constant 0 into a NULL pointer when cast to a void * type, Linus notes that this is a very poor situation from a type safety angle and a pointer should be initialized with a pointer type - not an integer constant. See https://www.spinics.net/lists/linux-sparse/msg10066.html for more info. [ bp: Rewrite commit message, drop useless comments in the code. ] Signed-off-by: Abhinav Singh Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231128141703.614605-1-singhabhinav9051571833@gmail.com --- drivers/edac/i7core_edac.c | 4 ++-- drivers/edac/sb_edac.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 23d25724bae4..91e0a88ef904 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -376,7 +376,7 @@ static const struct pci_id_table pci_dev_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_i7core_nehalem), PCI_ID_TABLE_ENTRY(pci_dev_descr_lynnfield), PCI_ID_TABLE_ENTRY(pci_dev_descr_i7core_westmere), - {0,} /* 0 terminated list. */ + { NULL, } }; /* @@ -385,7 +385,7 @@ static const struct pci_id_table pci_dev_table[] = { static const struct pci_device_id i7core_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_X58_HUB_MGMT)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNNFIELD_QPI_LINK0)}, - {0,} /* 0 terminated list. */ + { 0, } }; /**************************************************************************** diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 0c779a0326b6..26cca5a9322d 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -439,7 +439,7 @@ static const struct pci_id_descr pci_dev_descr_sbridge[] = { static const struct pci_id_table pci_dev_descr_sbridge_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_sbridge, ARRAY_SIZE(pci_dev_descr_sbridge), 1, SANDY_BRIDGE), - {0,} /* 0 terminated list. */ + { NULL, } }; /* This changes depending if 1HA or 2HA: @@ -505,7 +505,7 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = { static const struct pci_id_table pci_dev_descr_ibridge_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_ibridge, 12, 2, IVY_BRIDGE), - {0,} /* 0 terminated list. */ + { NULL, } }; /* Haswell support */ @@ -576,7 +576,7 @@ static const struct pci_id_descr pci_dev_descr_haswell[] = { static const struct pci_id_table pci_dev_descr_haswell_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_haswell, 13, 2, HASWELL), - {0,} /* 0 terminated list. */ + { NULL, } }; /* Knight's Landing Support */ @@ -620,7 +620,7 @@ static const struct pci_id_descr pci_dev_descr_knl[] = { static const struct pci_id_table pci_dev_descr_knl_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_knl, ARRAY_SIZE(pci_dev_descr_knl), 1, KNIGHTS_LANDING), - {0,} + { NULL, } }; /* @@ -686,7 +686,7 @@ static const struct pci_id_descr pci_dev_descr_broadwell[] = { static const struct pci_id_table pci_dev_descr_broadwell_table[] = { PCI_ID_TABLE_ENTRY(pci_dev_descr_broadwell, 10, 2, BROADWELL), - {0,} /* 0 terminated list. */ + { NULL, } }; From 47b744ea5e3cf855087951a74ba9f89180fa1ba5 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 2 Nov 2023 11:42:23 +0000 Subject: [PATCH 083/310] x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types Add HWID and McaType values for new SMCA bank types. Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231102114225.2006878-3-muralimk@amd.com --- arch/x86/include/asm/mce.h | 3 +++ arch/x86/kernel/cpu/mce/amd.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4ad49afca2db..de3118305838 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -311,6 +311,7 @@ enum smca_bank_types { SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_PSP_V2, @@ -326,6 +327,8 @@ enum smca_bank_types { SMCA_SHUB, /* System HUB Unit */ SMCA_SATA, /* SATA Unit */ SMCA_USB, /* USB Unit */ + SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ + SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index f6c6c1e0a581..2b46eb0fdf3a 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -102,6 +102,7 @@ static const char * const smca_names[] = { /* UMC v2 is separate because both of them can exist in a single system. */ [SMCA_UMC] = "umc", [SMCA_UMC_V2] = "umc_v2", + [SMCA_MA_LLC] = "ma_llc", [SMCA_PB] = "param_block", [SMCA_PSP ... SMCA_PSP_V2] = "psp", [SMCA_SMU ... SMCA_SMU_V2] = "smu", @@ -114,6 +115,8 @@ static const char * const smca_names[] = { [SMCA_SHUB] = "shub", [SMCA_SATA] = "sata", [SMCA_USB] = "usb", + [SMCA_USR_DP] = "usr_dp", + [SMCA_USR_CP] = "usr_cp", [SMCA_GMI_PCS] = "gmi_pcs", [SMCA_XGMI_PHY] = "xgmi_phy", [SMCA_WAFL_PHY] = "wafl_phy", @@ -164,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, + { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) }, /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, @@ -198,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) }, + { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) }, { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, From 9a5f580c1c71b6aedba696c4898a7a7184cef8ad Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 2 Nov 2023 11:42:24 +0000 Subject: [PATCH 084/310] EDAC/mc: Add support for HBM3 memory type AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory. HBM is a high-speed computer memory interface for 3D-stacked synchronous dynamic random-access memory (SDRAM). Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231102114225.2006878-4-muralimk@amd.com --- drivers/edac/edac_mc.c | 1 + include/linux/edac.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 6faeb2ab3960..d6eed727b0cd 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -166,6 +166,7 @@ const char * const edac_mem_types[] = { [MEM_NVDIMM] = "Non-volatile-RAM", [MEM_WIO2] = "Wide-IO-2", [MEM_HBM2] = "High-bandwidth-memory-Gen2", + [MEM_HBM3] = "High-bandwidth-memory-Gen3", }; EXPORT_SYMBOL_GPL(edac_mem_types); diff --git a/include/linux/edac.h b/include/linux/edac.h index fa4bda2a70f6..1174beb94ab6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. * @MEM_HBM2: High bandwidth Memory Gen 2. + * @MEM_HBM3: High bandwidth Memory Gen 3. */ enum mem_type { MEM_EMPTY = 0, @@ -218,6 +219,7 @@ enum mem_type { MEM_NVDIMM, MEM_WIO2, MEM_HBM2, + MEM_HBM3, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -248,6 +250,7 @@ enum mem_type { #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) +#define MEM_FLAG_HBM3 BIT(MEM_HBM3) /** * enum edac_type - Error Detection and Correction capabilities and mode From 12f230c07a95d925d6af754485952515e7975127 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 2 Nov 2023 11:42:25 +0000 Subject: [PATCH 085/310] EDAC/amd64: Add support for family 0x19, models 0x90-9f devices AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support is enabled by default. APU models have a single Data Fabric (DF) per Package. Each DF is visible to the OS in the same way as chiplet-based systems like Zen2 CPUs and later. However, the Unified Memory Controllers (UMCs) are arranged in the same way as GPU-based MI200 devices rather than CPU-based systems. Use the existing gpu_ops for hetergeneous systems to support enumeration of nodes and memory topology with few fixups. [ bp: Massage comments. ] Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com --- drivers/edac/amd64_edac.c | 66 ++++++++++++++++++++++++++++----------- drivers/edac/amd64_edac.h | 1 + 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9b6642d00871..537b9987a431 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -996,15 +996,23 @@ static struct local_node_map { #define LNTM_NODE_COUNT GENMASK(27, 16) #define LNTM_BASE_NODE_ID GENMASK(11, 0) -static int gpu_get_node_map(void) +static int gpu_get_node_map(struct amd64_pvt *pvt) { struct pci_dev *pdev; int ret; u32 tmp; /* - * Node ID 0 is reserved for CPUs. - * Therefore, a non-zero Node ID means we've already cached the values. + * Mapping of nodes from hardware-provided AMD Node ID to a + * Linux logical one is applicable for MI200 models. Therefore, + * return early for other heterogeneous systems. + */ + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3) + return 0; + + /* + * Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID + * means the values have been already cached. */ if (gpu_node_map.base_node_id) return 0; @@ -3851,7 +3859,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci) dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs); dimm->edac_mode = EDAC_SECDED; - dimm->mtype = MEM_HBM2; + dimm->mtype = pvt->dram_type; dimm->dtype = DEV_X16; dimm->grain = 64; } @@ -3880,7 +3888,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt) return true; } -static inline u32 gpu_get_umc_base(u8 umc, u8 channel) +static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel) { /* * On CPUs, there is one channel per UMC, so UMC numbering equals @@ -3893,13 +3901,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel) * On GPU nodes channels are selected in 3rd nibble * HBM chX[3:0]= [Y ]5X[3:0]000; * HBM chX[7:4]= [Y+1]5X[3:0]000 + * + * On MI300 APU nodes, same as GPU nodes but channels are selected + * in the base address of 0x90000 */ umc *= 2; if (channel >= 4) umc++; - return 0x50000 + (umc << 20) + ((channel % 4) << 12); + return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12); } static void gpu_read_mc_regs(struct amd64_pvt *pvt) @@ -3910,7 +3921,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt) /* Read registers from each UMC */ for_each_umc(i) { - umc_base = gpu_get_umc_base(i, 0); + umc_base = gpu_get_umc_base(pvt, i, 0); umc = &pvt->umc[i]; amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg); @@ -3927,7 +3938,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt) for_each_umc(umc) { for_each_chip_select(cs, umc, pvt) { - base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR; + base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR; base = &pvt->csels[umc].csbases[cs]; if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) { @@ -3935,7 +3946,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt) umc, cs, *base, base_reg); } - mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK; + mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK; mask = &pvt->csels[umc].csmasks[cs]; if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) { @@ -3960,7 +3971,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) { int ret; - ret = gpu_get_node_map(); + ret = gpu_get_node_map(pvt); if (ret) return ret; @@ -4125,6 +4136,8 @@ static int per_family_init(struct amd64_pvt *pvt) if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) { pvt->ctl_name = "MI200"; pvt->max_mcs = 4; + pvt->dram_type = MEM_HBM2; + pvt->gpu_umc_base = 0x50000; pvt->ops = &gpu_ops; } else { pvt->ctl_name = "F19h_M30h"; @@ -4142,6 +4155,13 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F19h_M70h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F19h_M90h"; + pvt->max_mcs = 4; + pvt->dram_type = MEM_HBM3; + pvt->gpu_umc_base = 0x90000; + pvt->ops = &gpu_ops; + break; case 0xa0 ... 0xaf: pvt->ctl_name = "F19h_MA0h"; pvt->max_mcs = 12; @@ -4180,23 +4200,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { NULL }; +/* + * For heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers + * should be swapped to fit into the layers. + */ +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer) +{ + bool is_gpu = (pvt->ops == &gpu_ops); + + if (!layer) + return is_gpu ? pvt->max_mcs + : pvt->csels[0].b_cnt; + else + return is_gpu ? pvt->csels[0].b_cnt + : pvt->max_mcs; +} + static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; int ret = -ENOMEM; - /* - * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should - * be swapped to fit into the layers. - */ layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; - layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->max_mcs : pvt->csels[0].b_cnt; + layers[0].size = get_layer_size(pvt, 0); layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->csels[0].b_cnt : pvt->max_mcs; + layers[1].size = get_layer_size(pvt, 1); layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 5a4e4a59682b..1665f7932bac 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -362,6 +362,7 @@ struct amd64_pvt { u32 dct_sel_lo; /* DRAM Controller Select Low */ u32 dct_sel_hi; /* DRAM Controller Select High */ u32 online_spare; /* On-Line spare Reg */ + u32 gpu_umc_base; /* Base address used for channel selection on GPUs */ /* x4, x8, or x16 syndromes in use */ u8 ecc_sym_sz; From 30fa92832f405d5ac9f263e99f62445fa3084008 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 31 Oct 2023 23:30:59 +0100 Subject: [PATCH 086/310] x86/CPU/AMD: Add ZenX generations flags Add X86_FEATURE flags for each Zen generation. They should be used from now on instead of checking f/m/s. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Acked-by: Thomas Gleixner Link: http://lore.kernel.org/r/20231120104152.13740-2-bp@alien8.de --- arch/x86/include/asm/cpufeatures.h | 5 ++- arch/x86/kernel/cpu/amd.c | 70 +++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 3e973ff20e23..149cc5d5c2ae 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -312,6 +312,9 @@ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ #define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 841e21213668..6aba2244d6c6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -616,6 +616,49 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } resctrl_cpu_detect(c); + + /* Figure out Zen generations: */ + switch (c->x86) { + case 0x17: { + switch (c->x86_model) { + case 0x00 ... 0x2f: + case 0x50 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN); + break; + case 0x30 ... 0x4f: + case 0x60 ... 0x7f: + case 0x90 ... 0x91: + case 0xa0 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN2); + break; + default: + goto warn; + } + break; + } + case 0x19: { + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN3); + break; + case 0x10 ... 0x1f: + case 0x60 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN4); + break; + default: + goto warn; + } + break; + } + default: + break; + } + + return; + +warn: + WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -974,8 +1017,6 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { - set_cpu_cap(c, X86_FEATURE_ZEN); - #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif @@ -1037,6 +1078,22 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } +static void init_amd_zen(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen2(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen3(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen4(struct cpuinfo_x86 *c) +{ +} + static void init_amd(struct cpuinfo_x86 *c) { u64 vm_cr; @@ -1077,6 +1134,15 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x19: init_amd_zn(c); break; } + if (boot_cpu_has(X86_FEATURE_ZEN)) + init_amd_zen(c); + else if (boot_cpu_has(X86_FEATURE_ZEN2)) + init_amd_zen2(c); + else if (boot_cpu_has(X86_FEATURE_ZEN3)) + init_amd_zen3(c); + else if (boot_cpu_has(X86_FEATURE_ZEN4)) + init_amd_zen4(c); + /* * Enable workaround for FXSAVE leak on CPUs * without a XSaveErPtr feature From a7c32a1ae9ee43abfe884f5af376877c4301d166 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 11:14:59 +0100 Subject: [PATCH 087/310] x86/CPU/AMD: Carve out the erratum 1386 fix Call it on the affected CPU generations. No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-3-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6aba2244d6c6..0a499cbbf644 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -984,6 +984,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static void fix_erratum_1386(struct cpuinfo_x86 *c) +{ + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + void init_spectral_chicken(struct cpuinfo_x86 *c) { #ifdef CONFIG_CPU_UNRET_ENTRY @@ -1004,15 +1017,6 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) } } #endif - /* - * Work around Erratum 1386. The XSAVES instruction malfunctions in - * certain circumstances on Zen1/2 uarch, and not all parts have had - * updated microcode at the time of writing (March 2023). - * - * Affected parts all have no supervisor XSAVE states, meaning that - * the XSAVEC instruction (which works fine) is equivalent. - */ - clear_cpu_cap(c, X86_FEATURE_XSAVES); } static void init_amd_zn(struct cpuinfo_x86 *c) @@ -1080,10 +1084,12 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen(struct cpuinfo_x86 *c) { + fix_erratum_1386(c); } static void init_amd_zen2(struct cpuinfo_x86 *c) { + fix_erratum_1386(c); } static void init_amd_zen3(struct cpuinfo_x86 *c) From affc66cb96f865b3763a8e18add52e133d864f04 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 11:28:31 +0100 Subject: [PATCH 088/310] x86/CPU/AMD: Move the Zen3 BTC_NO detection to the Zen3 init function No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-4-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0a499cbbf644..3d7434796360 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1031,14 +1031,6 @@ static void init_amd_zn(struct cpuinfo_x86 *c) /* Erratum 1076: CPB feature bit not being set in CPUID. */ if (!cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); - - /* - * Zen3 (Fam19 model < 0x10) parts are not susceptible to - * Branch Type Confusion, but predate the allocation of the - * BTC_NO bit. - */ - if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) - set_cpu_cap(c, X86_FEATURE_BTC_NO); } } @@ -1094,6 +1086,15 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) static void init_amd_zen3(struct cpuinfo_x86 *c) { + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (!cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } } static void init_amd_zen4(struct cpuinfo_x86 *c) From 0da91912fc150d6d321b15e648bead202ced1a27 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:31:44 +0100 Subject: [PATCH 089/310] x86/CPU/AMD: Move erratum 1076 fix into the Zen1 init function No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-5-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3d7434796360..ebe6be8b57ce 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1024,6 +1024,11 @@ static void init_amd_zn(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif +} + +static void init_amd_zen(struct cpuinfo_x86 *c) +{ + fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { @@ -1074,11 +1079,6 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } -static void init_amd_zen(struct cpuinfo_x86 *c) -{ - fix_erratum_1386(c); -} - static void init_amd_zen2(struct cpuinfo_x86 *c) { fix_erratum_1386(c); From cfbf4f992bfce1fa9f2f347a79cbbea0368e7971 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 11:20:01 +0100 Subject: [PATCH 090/310] x86/CPU/AMD: Call the spectral chicken in the Zen2 init function No functional change. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-6-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ebe6be8b57ce..82747b6bc6e0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1007,10 +1007,8 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * * This suppresses speculation from the middle of a basic block, i.e. it * suppresses non-branch predictions. - * - * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); @@ -1081,6 +1079,7 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen2(struct cpuinfo_x86 *c) { + init_spectral_chicken(c); fix_erratum_1386(c); } @@ -1136,7 +1135,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_spectral_chicken(c); + case 0x17: fallthrough; case 0x19: init_amd_zn(c); break; } From 7c81ad8e8bc28a1847e87c5afe1bae6bffb2f73e Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:34:29 +0100 Subject: [PATCH 091/310] x86/CPU/AMD: Rename init_amd_zn() to init_amd_zen_common() Call it from all Zen init functions. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-7-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 82747b6bc6e0..f53e0a227ac5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1017,7 +1017,7 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) #endif } -static void init_amd_zn(struct cpuinfo_x86 *c) +static void init_amd_zen_common(void) { #ifdef CONFIG_NUMA node_reclaim_distance = 32; @@ -1026,6 +1026,7 @@ static void init_amd_zn(struct cpuinfo_x86 *c) static void init_amd_zen(struct cpuinfo_x86 *c) { + init_amd_zen_common(); fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ @@ -1075,16 +1076,20 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } else { msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); } + } static void init_amd_zen2(struct cpuinfo_x86 *c) { + init_amd_zen_common(); init_spectral_chicken(c); fix_erratum_1386(c); } static void init_amd_zen3(struct cpuinfo_x86 *c) { + init_amd_zen_common(); + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { /* * Zen3 (Fam19 model < 0x10) parts are not susceptible to @@ -1098,6 +1103,7 @@ static void init_amd_zen3(struct cpuinfo_x86 *c) static void init_amd_zen4(struct cpuinfo_x86 *c) { + init_amd_zen_common(); } static void init_amd(struct cpuinfo_x86 *c) @@ -1135,9 +1141,6 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: - fallthrough; - case 0x19: init_amd_zn(c); break; } if (boot_cpu_has(X86_FEATURE_ZEN)) From f69759be251dce722942594fbc62e53a40822a82 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:38:35 +0100 Subject: [PATCH 092/310] x86/CPU/AMD: Move Zenbleed check to the Zen2 init function Prefix it properly so that it is clear which generation it is dealing with. No functional changes. Signed-off-by: Borislav Petkov (AMD) Link: http://lore.kernel.org/r/20231120104152.13740-8-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f53e0a227ac5..3c3b4c1ad3ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -70,12 +70,6 @@ static const int amd_erratum_383[] = static const int amd_erratum_1054[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); -static const int amd_zenbleed[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), - AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), - AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), - AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); - static const int amd_div0[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); @@ -1059,11 +1053,8 @@ static bool cpu_has_zenbleed_microcode(void) return true; } -static void zenbleed_check(struct cpuinfo_x86 *c) +static void zen2_zenbleed_check(struct cpuinfo_x86 *c) { - if (!cpu_has_amd_erratum(c, amd_zenbleed)) - return; - if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return; @@ -1084,6 +1075,7 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) init_amd_zen_common(); init_spectral_chicken(c); fix_erratum_1386(c); + zen2_zenbleed_check(c); } static void init_amd_zen3(struct cpuinfo_x86 *c) @@ -1227,8 +1219,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - zenbleed_check(c); - if (cpu_has_amd_erratum(c, amd_div0)) { pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); @@ -1393,7 +1383,7 @@ static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - zenbleed_check(c); + zen2_zenbleed_check(c); } void amd_check_microcode(void) From bfff3c6692ce64fa9d86eb829d18229c307a0855 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:52:01 +0100 Subject: [PATCH 093/310] x86/CPU/AMD: Move the DIV0 bug detection to the Zen1 init function No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-9-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3c3b4c1ad3ca..13ad44bea494 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -70,10 +70,6 @@ static const int amd_erratum_383[] = static const int amd_erratum_1054[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); -static const int amd_div0[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), - AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -1030,6 +1026,9 @@ static void init_amd_zen(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); } + + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); } static bool cpu_has_zenbleed_microcode(void) @@ -1067,7 +1066,6 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c) } else { msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT); } - } static void init_amd_zen2(struct cpuinfo_x86 *c) @@ -1219,11 +1217,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - if (cpu_has_amd_erratum(c, amd_div0)) { - pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); - setup_force_cpu_bug(X86_BUG_DIV0); - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has_amd_erratum(c, amd_erratum_1485)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); From 54c33e23f75d5c9925495231c57d3319336722ef Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 19:53:49 +0100 Subject: [PATCH 094/310] x86/CPU/AMD: Get rid of amd_erratum_1054[] No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-10-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 13ad44bea494..219ae7e0fb5a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,10 +66,6 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -1201,7 +1197,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - !cpu_has_amd_erratum(c, amd_erratum_1054)) + (boot_cpu_has(X86_FEATURE_ZEN) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); From 1709528f73d475d3c9ec514bc0dee0b41cadd871 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 19:58:53 +0100 Subject: [PATCH 095/310] x86/CPU/AMD: Get rid of amd_erratum_383[] Set it in init_amd_gh() unconditionally as that is the F10h init function. No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-11-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 219ae7e0fb5a..7ab7f980c2d1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -63,9 +63,6 @@ static const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -876,8 +873,7 @@ static void init_amd_gh(struct cpuinfo_x86 *c) */ msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - if (cpu_has_amd_erratum(c, amd_erratum_383)) - set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } static void init_amd_ln(struct cpuinfo_x86 *c) From b3ffbbd282d4eb79f489853a171242c2a06bd8b8 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 23:20:11 +0100 Subject: [PATCH 096/310] x86/CPU/AMD: Get rid of amd_erratum_400[] Setting X86_BUG_AMD_E400 in init_amd() is early enough. No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-12-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7ab7f980c2d1..550ac25c201b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -59,10 +59,6 @@ static u32 nodes_per_socket = 1; #define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) #define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -765,15 +761,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); - /* - * Check whether the machine is affected by erratum 400. This is - * used to select the proper idle routine and to enable the check - * whether the machine is affected in arch_post_acpi_init(), which - * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. - */ - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_E400); - early_detect_mem_encrypt(c); /* Re-enable TopologyExtensions if switched off by BIOS */ @@ -840,6 +827,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, 6); #endif set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (c->x86_model > 0x41 || + (c->x86_model == 0x41 && c->x86_stepping >= 0x2)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -874,6 +871,16 @@ static void init_amd_gh(struct cpuinfo_x86 *c) msr_clear_bit(MSR_AMD64_BU_CFG2, 24); set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (c->x86_model > 0x2 || + (c->x86_model == 0x2 && c->x86_stepping >= 0x1)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_ln(struct cpuinfo_x86 *c) From 794c68b20408bb6899f90314e36e256924cc85a1 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 23:21:56 +0100 Subject: [PATCH 097/310] x86/CPU/AMD: Get rid of amd_erratum_1485[] No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-13-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 550ac25c201b..f8be7ac40305 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -59,10 +59,6 @@ static u32 nodes_per_socket = 1; #define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) #define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) -static const int amd_erratum_1485[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), - AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); - static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { int osvw_id = *erratum++; @@ -1093,6 +1089,9 @@ static void init_amd_zen3(struct cpuinfo_x86 *c) static void init_amd_zen4(struct cpuinfo_x86 *c) { init_amd_zen_common(); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } static void init_amd(struct cpuinfo_x86 *c) @@ -1216,10 +1215,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && - cpu_has_amd_erratum(c, amd_erratum_1485)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); - /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } From 05f5f73936fa4c1bc0a852702edf53789398d278 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 23:40:48 +0100 Subject: [PATCH 098/310] x86/CPU/AMD: Drop now unused CPU erratum checking function Bye bye. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-14-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 56 --------------------------------------- 1 file changed, 56 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f8be7ac40305..89bbb1ac9ecf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -34,62 +34,6 @@ */ static u32 nodes_per_socket = 1; -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Sep 2023 13:24:21 +0200 Subject: [PATCH 099/310] ptrace: Convert ptrace_attach() to use lock guards Created as testing for the conditional guard infrastructure. Specifically this makes use of the following form: scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &task->signal->cred_guard_mutex) { ... } ... return 0; Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org --- include/linux/sched/task.h | 2 + include/linux/spinlock.h | 26 +++++++ kernel/ptrace.c | 154 ++++++++++++++++++------------------- 3 files changed, 102 insertions(+), 80 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a23af225c898..4f3dca353556 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ceb56b39c70f..90bc853cafb6 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) +DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, + read_lock(_T->lock), + read_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, + read_lock_irq(_T->lock), + read_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, + read_lock_irqsave(_T->lock, _T->flags), + read_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + +DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, + write_lock(_T->lock), + write_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, + write_lock_irq(_T->lock), + write_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, + write_lock_irqsave(_T->lock, _T->flags), + write_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d8b5e13a2229..5c579fb9a5e3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -386,71 +386,9 @@ static int check_ptrace_options(unsigned long data) return 0; } -static int ptrace_attach(struct task_struct *task, long request, - unsigned long addr, - unsigned long flags) +static inline void ptrace_set_stopped(struct task_struct *task) { - bool seize = (request == PTRACE_SEIZE); - int retval; - - retval = -EIO; - if (seize) { - if (addr != 0) - goto out; - /* - * This duplicates the check in check_ptrace_options() because - * ptrace_attach() and ptrace_setoptions() have historically - * used different error codes for unknown ptrace options. - */ - if (flags & ~(unsigned long)PTRACE_O_MASK) - goto out; - retval = check_ptrace_options(flags); - if (retval) - return retval; - flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); - } else { - flags = PT_PTRACED; - } - - audit_ptrace(task); - - retval = -EPERM; - if (unlikely(task->flags & PF_KTHREAD)) - goto out; - if (same_thread_group(task, current)) - goto out; - - /* - * Protect exec's credential calculations against our interference; - * SUID, SGID and LSM creds get determined differently - * under ptrace. - */ - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) - goto out; - - task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - task_unlock(task); - if (retval) - goto unlock_creds; - - write_lock_irq(&tasklist_lock); - retval = -EPERM; - if (unlikely(task->exit_state)) - goto unlock_tasklist; - if (task->ptrace) - goto unlock_tasklist; - - task->ptrace = flags; - - ptrace_link(task, current); - - /* SEIZE doesn't trap tracee on attach */ - if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); - - spin_lock(&task->sighand->siglock); + guard(spinlock)(&task->sighand->siglock); /* * If the task is already STOPPED, set JOBCTL_TRAP_STOP and @@ -474,28 +412,84 @@ static int ptrace_attach(struct task_struct *task, long request, task->jobctl &= ~JOBCTL_STOPPED; signal_wake_up_state(task, __TASK_STOPPED); } +} - spin_unlock(&task->sighand->siglock); +static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, + unsigned long flags) +{ + bool seize = (request == PTRACE_SEIZE); + int retval; - retval = 0; -unlock_tasklist: - write_unlock_irq(&tasklist_lock); -unlock_creds: - mutex_unlock(&task->signal->cred_guard_mutex); -out: - if (!retval) { + if (seize) { + if (addr != 0) + return -EIO; /* - * We do not bother to change retval or clear JOBCTL_TRAPPING - * if wait_on_bit() was interrupted by SIGKILL. The tracer will - * not return to user-mode, it will exit and clear this bit in - * __ptrace_unlink() if it wasn't already cleared by the tracee; - * and until then nobody can ptrace this task. + * This duplicates the check in check_ptrace_options() because + * ptrace_attach() and ptrace_setoptions() have historically + * used different error codes for unknown ptrace options. */ - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); - proc_ptrace_connector(task, PTRACE_ATTACH); + if (flags & ~(unsigned long)PTRACE_O_MASK) + return -EIO; + + retval = check_ptrace_options(flags); + if (retval) + return retval; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; } - return retval; + audit_ptrace(task); + + if (unlikely(task->flags & PF_KTHREAD)) + return -EPERM; + if (same_thread_group(task, current)) + return -EPERM; + + /* + * Protect exec's credential calculations against our interference; + * SUID, SGID and LSM creds get determined differently + * under ptrace. + */ + scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, + &task->signal->cred_guard_mutex) { + + scoped_guard (task_lock, task) { + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (retval) + return retval; + } + + scoped_guard (write_lock_irq, &tasklist_lock) { + if (unlikely(task->exit_state)) + return -EPERM; + if (task->ptrace) + return -EPERM; + + task->ptrace = flags; + + ptrace_link(task, current); + + /* SEIZE doesn't trap tracee on attach */ + if (!seize) + send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); + + ptrace_set_stopped(task); + } + } + + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); + proc_ptrace_connector(task, PTRACE_ATTACH); + + return 0; } /** From 7d09a052a3bdb62de9a86d43359d6c22eeaf105a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Nov 2023 10:13:22 -0500 Subject: [PATCH 100/310] x86: Add a comment about the "magic" behind shadow sti before mwait Add a note to make sure we never miss and break the requirements behind it. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20231115151325.6262-2-frederic@kernel.org --- arch/x86/include/asm/mwait.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 778df05f8539..341ee4f1d91e 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -87,6 +87,15 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx, :: "a" (eax), "b" (ebx), "c" (ecx)); } +/* + * Re-enable interrupts right upon calling mwait in such a way that + * no interrupt can fire _before_ the execution of mwait, ie: no + * instruction must be placed between "sti" and "mwait". + * + * This is necessary because if an interrupt queues a timer before + * executing mwait, it would otherwise go unnoticed and the next tick + * would not be reprogrammed accordingly before mwait ever wakes up. + */ static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx) { mds_idle_clear_cpu_buffers(); From edc8fc01f608108b0b7580cb2c29dfb5135e5f0e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 15 Nov 2023 10:13:23 -0500 Subject: [PATCH 101/310] x86: Fix CPUIDLE_FLAG_IRQ_ENABLE leaking timer reprogram intel_idle_irq() re-enables IRQs very early. As a result, an interrupt may fire before mwait() is eventually called. If such an interrupt queues a timer, it may go unnoticed until mwait returns and the idle loop handles the tick re-evaluation. And monitoring TIF_NEED_RESCHED doesn't help because a local timer enqueue doesn't set that flag. The issue is mitigated by the fact that this idle handler is only invoked for shallow C-states when, presumably, the next tick is supposed to be close enough. There may still be rare cases though when the next tick is far away and the selected C-state is shallow, resulting in a timer getting ignored for a while. Fix this with using sti_mwait() whose IRQ-reenablement only triggers upon calling mwait(), dealing with the race while keeping the interrupt latency within acceptable bounds. Fixes: c227233ad64c (intel_idle: enable interrupts before C1 on Xeons) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20231115151325.6262-3-frederic@kernel.org --- arch/x86/include/asm/mwait.h | 11 +++++++++-- drivers/idle/intel_idle.c | 19 +++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 341ee4f1d91e..920426d691ce 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -124,8 +124,15 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo } __monitor((void *)¤t_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); + + if (!need_resched()) { + if (ecx & 1) { + __mwait(eax, ecx); + } else { + __sti_mwait(eax, ecx); + raw_local_irq_disable(); + } + } } current_clr_polling(); } diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index dcda0afecfc5..3e01a6b23e75 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -131,11 +131,12 @@ static unsigned int mwait_substates __initdata; #define MWAIT2flg(eax) ((eax & 0xFF) << 24) static __always_inline int __intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) + struct cpuidle_driver *drv, + int index, bool irqoff) { struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); - unsigned long ecx = 1; /* break on interrupt flag */ + unsigned long ecx = 1*irqoff; /* break on interrupt flag */ mwait_idle_with_hints(eax, ecx); @@ -159,19 +160,13 @@ static __always_inline int __intel_idle(struct cpuidle_device *dev, static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - return __intel_idle(dev, drv, index); + return __intel_idle(dev, drv, index, true); } static __cpuidle int intel_idle_irq(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - int ret; - - raw_local_irq_enable(); - ret = __intel_idle(dev, drv, index); - raw_local_irq_disable(); - - return ret; + return __intel_idle(dev, drv, index, false); } static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, @@ -184,7 +179,7 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, if (smt_active) __update_spec_ctrl(0); - ret = __intel_idle(dev, drv, index); + ret = __intel_idle(dev, drv, index, true); if (smt_active) __update_spec_ctrl(spec_ctrl); @@ -196,7 +191,7 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { fpu_idle_fpregs(); - return __intel_idle(dev, drv, index); + return __intel_idle(dev, drv, index, true); } /** From 360f051d82ee0cc580edfffe9e8c0b93011ab86d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 00:19:15 +1100 Subject: [PATCH 102/310] powerpc/suspend: Add prototype for do_after_copyback() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With HIBERNATION=y the build breaks with: arch/powerpc/kernel/swsusp_64.c:14:6: error: no previous prototype for ‘do_after_copyback’ [-Werror=missing-prototypes] 14 | void do_after_copyback(void) | ^~~~~~~~~~~~~~~~~ do_after_copyback() is only called from asm, so there is no prototype, nor any header where it makes sense to place one. Just add a prototype in the C file to fix the build error. Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129131919.2528517-1-mpe@ellerman.id.au --- arch/powerpc/kernel/swsusp_64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/swsusp_64.c b/arch/powerpc/kernel/swsusp_64.c index 16ee3baaf09a..50fa8fc9ef95 100644 --- a/arch/powerpc/kernel/swsusp_64.c +++ b/arch/powerpc/kernel/swsusp_64.c @@ -11,6 +11,8 @@ #include #include +void do_after_copyback(void); + void do_after_copyback(void) { iommu_restore(); From 24afc61990de29dd47be7642c196a173f6cc21fc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 00:19:16 +1100 Subject: [PATCH 103/310] powerpc/512x: Make pdm360ng_init() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mpc512x_defconfig config fails with: arch/powerpc/platforms/512x/pdm360ng.c:104:13: error: no previous prototype for ‘pdm360ng_init’ [-Werror=missing-prototypes] 104 | void __init pdm360ng_init(void) | ^~~~~~~~~~~~~ Fix it by making pdm360ng_init() static. Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129131919.2528517-2-mpe@ellerman.id.au --- arch/powerpc/platforms/512x/pdm360ng.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/512x/pdm360ng.c b/arch/powerpc/platforms/512x/pdm360ng.c index ce51cfeeb066..8bbbf78bb42b 100644 --- a/arch/powerpc/platforms/512x/pdm360ng.c +++ b/arch/powerpc/platforms/512x/pdm360ng.c @@ -101,7 +101,7 @@ static inline void __init pdm360ng_touchscreen_init(void) } #endif /* CONFIG_TOUCHSCREEN_ADS7846 */ -void __init pdm360ng_init(void) +static void __init pdm360ng_init(void) { mpc512x_init(); pdm360ng_touchscreen_init(); From 10feb8f9612239b665815807e950bcd999a75dd2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 00:19:17 +1100 Subject: [PATCH 104/310] powerpc/512x: Fix missing prototype warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mpc512x_defconfig build fails with: arch/powerpc/platforms/512x/mpc5121_ads_cpld.c:142:1: error: no previous prototype for ‘mpc5121_ads_cpld_map’ [-Werror=missing-prototypes] 142 | mpc5121_ads_cpld_map(void) | ^~~~~~~~~~~~~~~~~~~~ arch/powerpc/platforms/512x/mpc5121_ads_cpld.c:157:1: error: no previous prototype for ‘mpc5121_ads_cpld_pic_init’ [-Werror=missing-prototypes] 157 | mpc5121_ads_cpld_pic_init(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~ There are prototypes for these functions but the header they are in is not included by mpc5121_ads_cpld.c. Include it to fix the build error. Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129131919.2528517-3-mpe@ellerman.id.au --- arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index 6f08d07aee3b..e995eb30bf09 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -17,6 +17,8 @@ #include #include +#include "mpc5121_ads.h" + static struct device_node *cpld_pic_node; static struct irq_domain *cpld_pic_host; From b90ad501715f2feb1b0bf97aa700adb39c78deb3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 00:19:18 +1100 Subject: [PATCH 105/310] powerpc/44x: Make ppc44x_idle_init() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 44x/fsp2_defconfig build fails with: arch/powerpc/platforms/44x/idle.c:30:12: error: no previous prototype for ‘ppc44x_idle_init’ [-Werror=missing-prototypes] 30 | int __init ppc44x_idle_init(void) | ^~~~~~~~~~~~~~~~ Fix it by making ppc44x_idle_init() static. Reviewed-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129131919.2528517-4-mpe@ellerman.id.au --- arch/powerpc/platforms/44x/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/44x/idle.c b/arch/powerpc/platforms/44x/idle.c index f533b495e7db..e2eeef8dff78 100644 --- a/arch/powerpc/platforms/44x/idle.c +++ b/arch/powerpc/platforms/44x/idle.c @@ -27,7 +27,7 @@ static void ppc44x_idle(void) isync(); } -int __init ppc44x_idle_init(void) +static int __init ppc44x_idle_init(void) { if (!mode_spin) { /* If we are not setting spin mode From 60c2ea7c89e375804171552d8ea53d9084ec3269 Mon Sep 17 00:00:00 2001 From: Samuel Zeter Date: Wed, 29 Nov 2023 15:17:41 -0700 Subject: [PATCH 106/310] x86/tools: objdump_reformat.awk: Ensure regex matches fwait If there is "wait" mnemonic in the line being parsed, it is incorrectly handled by the script, and an extra line of "fwait" in objdump_reformat's output is inserted. As insn_decoder_test relies upon the formatted output, the test fails. This is reproducible when disassembling with llvm-objdump: Pre-processed lines: ffffffff81033e72: 9b wait ffffffff81033e73: 48 c7 c7 89 50 42 82 movq After objdump_reformat.awk: ffffffff81033e72: 9b fwait ffffffff81033e72: wait ffffffff81033e73: 48 c7 c7 89 50 42 82 movq The regex match now accepts spaces or tabs, along with the "fwait" instruction. Signed-off-by: Samuel Zeter Signed-off-by: Nathan Chancellor Signed-off-by: Ingo Molnar Tested-by: Nathan Chancellor Tested-by: Kees Cook Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20231129-objdump-reformat-llvm-v3-1-0d855e79314d@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1364 --- arch/x86/tools/objdump_reformat.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk index f418c91b71f0..276e572a6f60 100644 --- a/arch/x86/tools/objdump_reformat.awk +++ b/arch/x86/tools/objdump_reformat.awk @@ -12,7 +12,7 @@ BEGIN { prev_hex = "" prev_mnemonic = "" bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" - fwait_expr = "^9b " + fwait_expr = "^9b[ \t]*fwait" fwait_str="9b\tfwait" } From f4570ebd836363dc7722b8eb8d099b311021af13 Mon Sep 17 00:00:00 2001 From: Samuel Zeter Date: Wed, 29 Nov 2023 15:17:42 -0700 Subject: [PATCH 107/310] x86/tools: objdump_reformat.awk: Allow for spaces GNU objdump and LLVM objdump have differing output formats. Specifically, GNU objump will format its output as: address:hex, whereas LLVM objdump displays its output as address:hex. objdump_reformat.awk incorrectly handles this discrepancy due to the unexpected space and as a result insn_decoder_test fails, as its input is garbled. The instruction line being tokenized now handles a space and colon, or tab delimiter. Signed-off-by: Samuel Zeter Signed-off-by: Nathan Chancellor Signed-off-by: Ingo Molnar Tested-by: Nathan Chancellor Tested-by: Kees Cook Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20231129-objdump-reformat-llvm-v3-2-0d855e79314d@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1364 --- arch/x86/tools/objdump_reformat.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk index 276e572a6f60..a4120d907277 100644 --- a/arch/x86/tools/objdump_reformat.awk +++ b/arch/x86/tools/objdump_reformat.awk @@ -22,7 +22,7 @@ BEGIN { } /^ *[0-9a-f]+:/ { - if (split($0, field, "\t") < 3) { + if (split($0, field, /: |\t/) < 3) { # This is a continuation of the same insn. prev_hex = prev_hex field[2] } else { From 5225952d74d43e4c054731c74b8afd700b23a94a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 29 Nov 2023 15:17:43 -0700 Subject: [PATCH 108/310] x86/tools: Remove chkobjdump.awk This check is superfluous now that the minimum version of binutils to build the kernel is 2.25. This also fixes an error seen with llvm-objdump because it does not support '-v' prior to LLVM 13: llvm-objdump: error: unknown argument '-v' Signed-off-by: Nathan Chancellor Signed-off-by: Ingo Molnar Tested-by: Kees Cook Reviewed-by: Kees Cook Link: https://github.com/llvm/llvm-project/commit/dde24a87c55f82d8c7b3bf3eafb10f2b9b2b9a01 Link: https://lore.kernel.org/r/20231129-objdump-reformat-llvm-v3-3-0d855e79314d@kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/1362 --- arch/x86/tools/Makefile | 2 +- arch/x86/tools/chkobjdump.awk | 34 ---------------------------------- 2 files changed, 1 insertion(+), 35 deletions(-) delete mode 100644 arch/x86/tools/chkobjdump.awk diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 90e820ac9771..7278e2545c35 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -17,7 +17,7 @@ reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ - cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose) + cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose) quiet_cmd_sanitytest = TEST $@ cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk deleted file mode 100644 index a4cf678cf5c8..000000000000 --- a/arch/x86/tools/chkobjdump.awk +++ /dev/null @@ -1,34 +0,0 @@ -# GNU objdump version checker -# -# Usage: -# objdump -v | awk -f chkobjdump.awk -BEGIN { - # objdump version 2.19 or later is OK for the test. - od_ver = 2; - od_sver = 19; -} - -/^GNU objdump/ { - verstr = "" - gsub(/\(.*\)/, ""); - for (i = 3; i <= NF; i++) - if (match($(i), "^[0-9]")) { - verstr = $(i); - break; - } - if (verstr == "") { - printf("Warning: Failed to find objdump version number.\n"); - exit 0; - } - split(verstr, ver, "."); - if (ver[1] > od_ver || - (ver[1] == od_ver && ver[2] >= od_sver)) { - exit 1; - } else { - printf("Warning: objdump version %s is older than %d.%d\n", - verstr, od_ver, od_sver); - print("Warning: Skipping posttest."); - # Logic is inverted, because we just skip test without error. - exit 0; - } -} From 78a509fba9c9b1fcb77f95b7c6be30da3d24823a Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Wed, 29 Nov 2023 15:44:49 -0500 Subject: [PATCH 109/310] x86/boot: Ignore NMIs during very early boot When there are two racing NMIs on x86, the first NMI invokes NMI handler and the 2nd NMI is latched until IRET is executed. If panic on NMI and panic kexec are enabled, the first NMI triggers panic and starts booting the next kernel via kexec. Note that the 2nd NMI is still latched. During the early boot of the next kernel, once an IRET is executed as a result of a page fault, then the 2nd NMI is unlatched and invokes the NMI handler. However, NMI handler is not set up at the early stage of boot, which results in a boot failure. Avoid such problems by setting up a NOP handler for early NMIs. [ mingo: Refined the changelog. ] Signed-off-by: Jun'ichi Nomura Signed-off-by: Derek Barbosa Signed-off-by: Ingo Molnar Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: "H. Peter Anvin" Cc: Peter Zijlstra --- arch/x86/boot/compressed/ident_map_64.c | 5 +++++ arch/x86/boot/compressed/idt_64.c | 1 + arch/x86/boot/compressed/idt_handlers_64.S | 1 + arch/x86/boot/compressed/misc.h | 1 + 4 files changed, 8 insertions(+) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index 473ba59b82a8..d040080d7edb 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -386,3 +386,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) */ kernel_add_identity_map(address, end); } + +void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code) +{ + /* Empty handler to ignore NMI during early boot */ +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 3cdf94b41456..d100284bbef4 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -61,6 +61,7 @@ void load_stage2_idt(void) boot_idt_desc.address = (unsigned long)boot_idt; set_idt_entry(X86_TRAP_PF, boot_page_fault); + set_idt_entry(X86_TRAP_NMI, boot_nmi_trap); #ifdef CONFIG_AMD_MEM_ENCRYPT /* diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S index 22890e199f5b..4d03c8562f63 100644 --- a/arch/x86/boot/compressed/idt_handlers_64.S +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -70,6 +70,7 @@ SYM_FUNC_END(\name) .code64 EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 +EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0 #ifdef CONFIG_AMD_MEM_ENCRYPT EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index c0d502bd8716..bc2f0f17fb90 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -196,6 +196,7 @@ static inline void cleanup_exception_handling(void) { } /* IDT Entry Points */ void boot_page_fault(void); +void boot_nmi_trap(void); void boot_stage1_vc(void); void boot_stage2_vc(void); From ede66cd22441820cbd399936bf84fdc4294bc7fa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 00:19:19 +1100 Subject: [PATCH 110/310] powerpc/64s: Fix CONFIG_NUMA=n build due to create_section_mapping() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_NUMA=n the build fails with: arch/powerpc/mm/book3s64/pgtable.c:275:15: error: no previous prototype for ‘create_section_mapping’ [-Werror=missing-prototypes] 275 | int __meminit create_section_mapping(unsigned long start, unsigned long end, | ^~~~~~~~~~~~~~~~~~~~~~ That happens because the prototype for create_section_mapping() is in asm/mmzone.h, but asm/mmzone.h is only included by linux/mmzone.h when CONFIG_NUMA=y. In fact the prototype is only needed by arch/powerpc/mm code, so move the prototype into arch/powerpc/mm/mmu_decl.h, which also fixes the build error. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129131919.2528517-5-mpe@ellerman.id.au --- arch/powerpc/include/asm/mmzone.h | 5 ----- arch/powerpc/mm/mmu_decl.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 4c6c6dbd182f..4740ca230d36 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -46,10 +46,5 @@ u64 memory_hotplug_max(void); #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif -#ifdef CONFIG_MEMORY_HOTPLUG -extern int create_section_mapping(unsigned long start, unsigned long end, - int nid, pgprot_t prot); -#endif - #endif /* __KERNEL__ */ #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7f9ff0640124..72341b9fb552 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -181,3 +181,8 @@ static inline bool debug_pagealloc_enabled_or_kfence(void) { return IS_ENABLED(CONFIG_KFENCE) || debug_pagealloc_enabled(); } + +#ifdef CONFIG_MEMORY_HOTPLUG +int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif From d8c3f243d4db24675b653f0568bb65dae34e6455 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 22:44:32 +1100 Subject: [PATCH 111/310] powerpc/mm: Fix build failures due to arch_reserved_kernel_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With NUMA=n and FA_DUMP=y or PRESERVE_FA_DUMP=y the build fails with: arch/powerpc/kernel/fadump.c:1739:22: error: no previous prototype for ‘arch_reserved_kernel_pages’ [-Werror=missing-prototypes] 1739 | unsigned long __init arch_reserved_kernel_pages(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ The prototype for arch_reserved_kernel_pages() is in include/linux/mm.h, but it's guarded by __HAVE_ARCH_RESERVED_KERNEL_PAGES. The powerpc headers define __HAVE_ARCH_RESERVED_KERNEL_PAGES in asm/mmzone.h, which is not included into the generic headers when NUMA=n. Move the definition of __HAVE_ARCH_RESERVED_KERNEL_PAGES into asm/mmu.h which is included regardless of NUMA=n. Additionally the ifdef around __HAVE_ARCH_RESERVED_KERNEL_PAGES needs to also check for CONFIG_PRESERVE_FA_DUMP. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231130114433.3053544-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/mmu.h | 4 ++++ arch/powerpc/include/asm/mmzone.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 52cc25864a1b..d8b7e246a32f 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -412,5 +412,9 @@ extern void *abatron_pteptrs[2]; #include #endif +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) +#define __HAVE_ARCH_RESERVED_KERNEL_PAGES +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 4740ca230d36..da827d2d0866 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -42,9 +42,6 @@ u64 memory_hotplug_max(void); #else #define memory_hotplug_max() memblock_end_of_DRAM() #endif /* CONFIG_NUMA */ -#ifdef CONFIG_FA_DUMP -#define __HAVE_ARCH_RESERVED_KERNEL_PAGES -#endif #endif /* __KERNEL__ */ #endif /* _ASM_MMZONE_H_ */ From f8d3555355653848082c351fa90775214fb8a4fa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 30 Nov 2023 22:44:33 +1100 Subject: [PATCH 112/310] powerpc: Fix build error due to is_valid_bugaddr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_GENERIC_BUG=n the build fails with: arch/powerpc/kernel/traps.c:1442:5: error: no previous prototype for ‘is_valid_bugaddr’ [-Werror=missing-prototypes] 1442 | int is_valid_bugaddr(unsigned long addr) | ^~~~~~~~~~~~~~~~ The prototype is only defined, and the function is only needed, when CONFIG_GENERIC_BUG=y, so move the implementation under that. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231130114433.3053544-2-mpe@ellerman.id.au --- arch/powerpc/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5ea2014aff90..11e062b47d3f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1439,10 +1439,12 @@ static int emulate_instruction(struct pt_regs *regs) return -EINVAL; } +#ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { return is_kernel_addr(addr); } +#endif #ifdef CONFIG_MATH_EMULATION static int emulate_math(struct pt_regs *regs) From e12d8e2602d2bcd26022eff3e2519d25925e760c Mon Sep 17 00:00:00 2001 From: Zhao Ke Date: Wed, 29 Nov 2023 15:58:45 +0800 Subject: [PATCH 113/310] powerpc: Add PVN support for HeXin C2000 processor HeXin Tech Co. has applied for a new PVN from the OpenPower Community for its new processor C2000. The OpenPower has assigned a new PVN and this newly assigned PVN is 0x0066, add pvr register related support for this PVN. Signed-off-by: Zhao Ke Link: https://discuss.openpower.foundation/t/how-to-get-a-new-pvr-for-processors-follow-power-isa/477/10 Signed-off-by: Michael Ellerman Link: https://msgid.link/20231129075845.57976-1-ke.zhao@shingroup.cn --- arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/cpu_specs_book3s_64.h | 15 +++++++++++++++ arch/powerpc/kvm/book3s_pr.c | 1 + arch/powerpc/mm/book3s64/pkeys.c | 3 ++- arch/powerpc/platforms/powernv/subcore.c | 3 ++- drivers/misc/cxl/cxl.h | 3 ++- 6 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4ae4ab9090a2..7fd09f25452d 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1361,6 +1361,7 @@ #define PVR_POWER8E 0x004B #define PVR_POWER8NVL 0x004C #define PVR_POWER8 0x004D +#define PVR_HX_C2000 0x0066 #define PVR_POWER9 0x004E #define PVR_POWER10 0x0080 #define PVR_BE 0x0070 diff --git a/arch/powerpc/kernel/cpu_specs_book3s_64.h b/arch/powerpc/kernel/cpu_specs_book3s_64.h index c370c1b804a9..3ff9757df4c0 100644 --- a/arch/powerpc/kernel/cpu_specs_book3s_64.h +++ b/arch/powerpc/kernel/cpu_specs_book3s_64.h @@ -238,6 +238,21 @@ static struct cpu_spec cpu_specs[] __initdata = { .machine_check_early = __machine_check_early_realmode_p8, .platform = "power8", }, + { /* 2.07-compliant processor, HeXin C2000 processor */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00660000, + .cpu_name = "HX-C2000", + .cpu_features = CPU_FTRS_POWER8, + .cpu_user_features = COMMON_USER_POWER8, + .cpu_user_features2 = COMMON_USER2_POWER8, + .mmu_features = MMU_FTRS_POWER8, + .icache_bsize = 128, + .dcache_bsize = 128, + .cpu_setup = __setup_cpu_power8, + .cpu_restore = __restore_cpu_power8, + .machine_check_early = __machine_check_early_realmode_p8, + .platform = "power8", + }, { /* 3.00-compliant processor, i.e. Power9 "architected" mode */ .pvr_mask = 0xffffffff, .pvr_value = 0x0f000005, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 9118242063fb..5b92619a05fd 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -604,6 +604,7 @@ static void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: + case PVR_HX_C2000: case PVR_POWER9: vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | BOOK3S_HFLAG_NEW_TLBIE; diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c index 125733962033..a974baf8f327 100644 --- a/arch/powerpc/mm/book3s64/pkeys.c +++ b/arch/powerpc/mm/book3s64/pkeys.c @@ -89,7 +89,8 @@ static int __init scan_pkey_feature(void) unsigned long pvr = mfspr(SPRN_PVR); if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E || - PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9) + PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9 || + PVR_VER(pvr) == PVR_HX_C2000) pkeys_total = 32; } } diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 191424468f10..393e747541fb 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -425,7 +425,8 @@ static int subcore_init(void) if (pvr_ver != PVR_POWER8 && pvr_ver != PVR_POWER8E && - pvr_ver != PVR_POWER8NVL) + pvr_ver != PVR_POWER8NVL && + pvr_ver != PVR_HX_C2000) return 0; /* diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 0562071cdd4a..6ad0ab892675 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -836,7 +836,8 @@ static inline bool cxl_is_power8(void) { if ((pvr_version_is(PVR_POWER8E)) || (pvr_version_is(PVR_POWER8NVL)) || - (pvr_version_is(PVR_POWER8))) + (pvr_version_is(PVR_POWER8)) || + (pvr_version_is(PVR_HX_C2000))) return true; return false; } From a9e1e4d6e8c77c732e8084b03bae0c78cafdceb0 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Fri, 24 Nov 2023 11:02:37 +0100 Subject: [PATCH 114/310] powerpc/85xx: Fix typo in code comment s/singals/signals/ Signed-off-by: Dario Binacchi Signed-off-by: Michael Ellerman Link: https://msgid.link/20231124100241.660374-1-dario.binacchi@amarulasolutions.com --- arch/powerpc/platforms/85xx/mpc85xx_rdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/85xx/mpc85xx_rdb.c b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c index ec9f60fbebc7..e0cec670d8db 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_rdb.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c @@ -76,7 +76,7 @@ static void __init mpc85xx_rdb_setup_arch(void) /* P1025 has pins muxed for QE and other functions. To * enable QE UEC mode, we need to set bit QE0 for UCC1 * in Eth mode, QE0 and QE3 for UCC5 in Eth mode, QE9 - * and QE12 for QE MII management singals in PMUXCR + * and QE12 for QE MII management signals in PMUXCR * register. */ setbits32(&guts->pmuxcr, MPC85xx_PMUXCR_QE(0) | From 4a74197b65e69c46fe6e53f7df2f4d6ce9ffe012 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Nov 2023 21:51:59 -0800 Subject: [PATCH 115/310] powerpc/44x: select I2C for CURRITUCK Fix build errors when CURRITUCK=y and I2C is not builtin (=m or is not set). Fixes these build errors: powerpc-linux-ld: arch/powerpc/platforms/44x/ppc476.o: in function `avr_halt_system': ppc476.c:(.text+0x58): undefined reference to `i2c_smbus_write_byte_data' powerpc-linux-ld: arch/powerpc/platforms/44x/ppc476.o: in function `ppc47x_device_probe': ppc476.c:(.init.text+0x18): undefined reference to `i2c_register_driver' Fixes: 2a2c74b2efcb ("IBM Akebono: Add the Akebono platform") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Closes: lore.kernel.org/r/202312010820.cmdwF5X9-lkp@intel.com Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201055159.8371-1-rdunlap@infradead.org --- arch/powerpc/platforms/44x/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 1624ebf95497..35a1f4b9f827 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -173,6 +173,7 @@ config ISS4xx config CURRITUCK bool "IBM Currituck (476fpe) Support" depends on PPC_47x + select I2C select SWIOTLB select 476FPE select FORCE_PCI From bd68ffce69f6cf8ddd3a3c32549d1d2275e49fc5 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 14 Nov 2023 11:01:53 -0600 Subject: [PATCH 116/310] powerpc/pseries/memhp: Fix access beyond end of drmem array dlpar_memory_remove_by_index() may access beyond the bounds of the drmem lmb array when the LMB lookup fails to match an entry with the given DRC index. When the search fails, the cursor is left pointing to &drmem_info->lmbs[drmem_info->n_lmbs], which is one element past the last valid entry in the array. The debug message at the end of the function then dereferences this pointer: pr_debug("Failed to hot-remove memory at %llx\n", lmb->base_addr); This was found by inspection and confirmed with KASAN: pseries-hotplug-mem: Attempting to hot-remove LMB, drc index 1234 ================================================================== BUG: KASAN: slab-out-of-bounds in dlpar_memory+0x298/0x1658 Read of size 8 at addr c000000364e97fd0 by task bash/949 dump_stack_lvl+0xa4/0xfc (unreliable) print_report+0x214/0x63c kasan_report+0x140/0x2e0 __asan_load8+0xa8/0xe0 dlpar_memory+0x298/0x1658 handle_dlpar_errorlog+0x130/0x1d0 dlpar_store+0x18c/0x3e0 kobj_attr_store+0x68/0xa0 sysfs_kf_write+0xc4/0x110 kernfs_fop_write_iter+0x26c/0x390 vfs_write+0x2d4/0x4e0 ksys_write+0xac/0x1a0 system_call_exception+0x268/0x530 system_call_vectored_common+0x15c/0x2ec Allocated by task 1: kasan_save_stack+0x48/0x80 kasan_set_track+0x34/0x50 kasan_save_alloc_info+0x34/0x50 __kasan_kmalloc+0xd0/0x120 __kmalloc+0x8c/0x320 kmalloc_array.constprop.0+0x48/0x5c drmem_init+0x2a0/0x41c do_one_initcall+0xe0/0x5c0 kernel_init_freeable+0x4ec/0x5a0 kernel_init+0x30/0x1e0 ret_from_kernel_user_thread+0x14/0x1c The buggy address belongs to the object at c000000364e80000 which belongs to the cache kmalloc-128k of size 131072 The buggy address is located 0 bytes to the right of allocated 98256-byte region [c000000364e80000, c000000364e97fd0) ================================================================== pseries-hotplug-mem: Failed to hot-remove memory at 0 Log failed lookups with a separate message and dereference the cursor only when it points to a valid entry. Signed-off-by: Nathan Lynch Fixes: 51925fb3c5c9 ("powerpc/pseries: Implement memory hotplug remove in the kernel") Signed-off-by: Michael Ellerman Link: https://msgid.link/20231114-pseries-memhp-fixes-v1-1-fb8f2bb7c557@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index a43bfb01720a..6f2eebae7bee 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -436,14 +436,15 @@ static int dlpar_memory_remove_by_index(u32 drc_index) } } - if (!lmb_found) + if (!lmb_found) { + pr_debug("Failed to look up LMB for drc index %x\n", drc_index); rc = -EINVAL; - - if (rc) + } else if (rc) { pr_debug("Failed to hot-remove memory at %llx\n", lmb->base_addr); - else + } else { pr_debug("Memory at %llx was hot-removed\n", lmb->base_addr); + } return rc; } From 27951e1d8274e9f9a2925b069e4492939a3f2099 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 14 Nov 2023 11:01:55 -0600 Subject: [PATCH 117/310] powerpc/pseries/memhp: Log more error conditions in add path When an add operation for multiple LMBs fails, there is currently little indication from the kernel of what went wrong. Be a little more verbose about error conditions in the add paths. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231114-pseries-memhp-fixes-v1-3-fb8f2bb7c557@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 6f2eebae7bee..3fe3ddb30c04 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -208,8 +208,10 @@ static int dlpar_change_lmb_state(struct drmem_lmb *lmb, bool online) int rc; mem_block = lmb_to_memblock(lmb); - if (!mem_block) + if (!mem_block) { + pr_err("Failed memory block lookup for LMB 0x%x\n", lmb->drc_index); return -EINVAL; + } if (online && mem_block->dev.offline) rc = device_online(&mem_block->dev); @@ -576,6 +578,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = update_lmb_associativity_index(lmb); if (rc) { dlpar_release_drc(lmb->drc_index); + pr_err("Failed to configure LMB 0x%x\n", lmb->drc_index); return rc; } @@ -589,12 +592,14 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) /* Add the memory */ rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY); if (rc) { + pr_err("Failed to add LMB 0x%x to node %u", lmb->drc_index, nid); invalidate_lmb_associativity_index(lmb); return rc; } rc = dlpar_online_lmb(lmb); if (rc) { + pr_err("Failed to online LMB 0x%x on node %u\n", lmb->drc_index, nid); __remove_memory(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); } else { From a51749ab34d9e5dec548fe38ede7e01e8bb26454 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 30 Nov 2023 21:48:17 +0100 Subject: [PATCH 118/310] locking/mutex: Document that mutex_unlock() is non-atomic I have seen several cases of attempts to use mutex_unlock() to release an object such that the object can then be freed by another task. This is not safe because mutex_unlock(), in the MUTEX_FLAG_WAITERS && !MUTEX_FLAG_HANDOFF case, accesses the mutex structure after having marked it as unlocked; so mutex_unlock() requires its caller to ensure that the mutex stays alive until mutex_unlock() returns. If MUTEX_FLAG_WAITERS is set and there are real waiters, those waiters have to keep the mutex alive, but we could have a spurious MUTEX_FLAG_WAITERS left if an interruptible/killable waiter bailed between the points where __mutex_unlock_slowpath() did the cmpxchg reading the flags and where it acquired the wait_lock. ( With spinlocks, that kind of code pattern is allowed and, from what I remember, used in several places in the kernel. ) Document this, such a semantic difference between mutexes and spinlocks is fairly unintuitive. [ mingo: Made the changelog a bit more assertive, refined the comments. ] Signed-off-by: Jann Horn Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231130204817.2031407-1-jannh@google.com --- Documentation/locking/mutex-design.rst | 6 ++++++ kernel/locking/mutex.c | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst index 78540cd7f54b..7572339b2f12 100644 --- a/Documentation/locking/mutex-design.rst +++ b/Documentation/locking/mutex-design.rst @@ -101,6 +101,12 @@ features that make lock debugging easier and faster: - Detects multi-task circular deadlocks and prints out all affected locks and tasks (and only those tasks). +Releasing a mutex is not an atomic operation: Once a mutex release operation +has begun, another context may be able to acquire the mutex before the release +operation has fully completed. The mutex user must ensure that the mutex is not +destroyed while a release operation is still in progress - in other words, +callers of mutex_unlock() must ensure that the mutex stays alive until +mutex_unlock() has returned. Interfaces ---------- diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2deeeca3e71b..cbae8c0b89ab 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -532,6 +532,11 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne * This function must not be used in interrupt context. Unlocking * of a not locked mutex is not allowed. * + * The caller must ensure that the mutex stays alive until this function has + * returned - mutex_unlock() can NOT directly be used to release an object such + * that another concurrent task can free it. + * Mutexes are different from spinlocks & refcounts in this aspect. + * * This function is similar to (but not equivalent to) up(). */ void __sched mutex_unlock(struct mutex *lock) From 1f693ef550f051ef6a95dbfadfa4fbd600769a81 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 29 Nov 2023 13:56:43 -0800 Subject: [PATCH 119/310] x86/microcode/intel: Remove redundant microcode late updated message After successful update, the late loading routine prints an update summary similar to: microcode: load: updated on 128 primary CPUs with 128 siblings microcode: revision: 0x21000170 -> 0x21000190 Remove the redundant message in the Intel side of the driver. [ bp: Massage commit message. ] Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/ZWjYhedNfhAUmt0k@a4bf019067fa.jf.intel.com --- arch/x86/kernel/cpu/microcode/intel.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 070426b9895f..5d6ea875b81b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -457,12 +457,6 @@ static enum ucode_state apply_microcode_late(int cpu) if (ret != UCODE_UPDATED && ret != UCODE_OK) return ret; - if (!cpu && uci->cpu_sig.rev != cur_rev) { - pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n", - uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); - } - cpu_data(cpu).microcode = uci->cpu_sig.rev; if (!cpu) boot_cpu_data.microcode = uci->cpu_sig.rev; From 9c21ea53e6bd1104c637b80a0688040f184cc761 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 1 Dec 2023 14:35:06 +0100 Subject: [PATCH 120/310] x86/microcode/intel: Set new revision only after a successful update This was meant to be done only when early microcode got updated successfully. Move it into the if-branch. Also, make sure the current revision is read unconditionally and only once. Fixes: 080990aa3344 ("x86/microcode: Rework early revisions reporting") Reported-by: Ashok Raj Signed-off-by: Borislav Petkov (AMD) Tested-by: Ashok Raj Link: https://lore.kernel.org/r/ZWjVt5dNRjbcvlzR@a4bf019067fa.jf.intel.com --- arch/x86/kernel/cpu/microcode/intel.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5d6ea875b81b..857e608af641 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -370,14 +370,14 @@ static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info * { struct cpio_data cp; + intel_collect_cpu_info(&uci->cpu_sig); + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); if (!(cp.data && cp.size)) return NULL; - intel_collect_cpu_info(&uci->cpu_sig); - return scan_microcode(cp.data, cp.size, uci, save); } @@ -410,13 +410,13 @@ void __init load_ucode_intel_bsp(struct early_load_data *ed) { struct ucode_cpu_info uci; - ed->old_rev = intel_get_microcode_revision(); - uci.mc = get_microcode_blob(&uci, false); - if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) - ucode_patch_va = UCODE_BSP_LOADED; + ed->old_rev = uci.cpu_sig.rev; - ed->new_rev = uci.cpu_sig.rev; + if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) { + ucode_patch_va = UCODE_BSP_LOADED; + ed->new_rev = uci.cpu_sig.rev; + } } void load_ucode_intel_ap(void) From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:49 -0700 Subject: [PATCH 121/310] iov_iter: remove unused 'iov' argument from import_single_range() It is entirely unused, just get rid of it. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk Signed-off-by: Christian Brauner --- drivers/block/ublk_drv.c | 6 ++---- drivers/char/random.c | 6 ++---- fs/aio.c | 2 +- include/linux/uio.h | 2 +- kernel/trace/trace_events_user.c | 3 +-- lib/iov_iter.c | 2 +- net/ipv4/tcp.c | 6 ++---- net/socket.c | 6 ++---- security/keys/keyctl.c | 3 +-- 9 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 83600b45e12a..5656b0a1233d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -893,11 +893,10 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, */ if (ublk_need_map_req(req)) { struct iov_iter iter; - struct iovec iov; const int dir = ITER_DEST; import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, - &iov, &iter); + &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } @@ -915,13 +914,12 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, if (ublk_need_unmap_req(req)) { struct iov_iter iter; - struct iovec iov; const int dir = ITER_SOURCE; WARN_ON_ONCE(io->res > rq_bytes); import_single_range(dir, u64_to_user_ptr(io->addr), io->res, - &iov, &iter); + &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; diff --git a/drivers/char/random.c b/drivers/char/random.c index 4a9c79391dee..e79ae238b30d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1364,7 +1364,6 @@ static void __cold try_to_generate_entropy(void) SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { struct iov_iter iter; - struct iovec iov; int ret; if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)) @@ -1385,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } - ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter); + ret = import_single_range(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); @@ -1491,7 +1490,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return 0; case RNDADDENTROPY: { struct iov_iter iter; - struct iovec iov; ssize_t ret; int len; @@ -1503,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EINVAL; if (get_user(len, p++)) return -EFAULT; - ret = import_single_range(ITER_SOURCE, p, len, &iov, &iter); + ret = import_single_range(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); diff --git a/fs/aio.c b/fs/aio.c index f8589caef9c1..251eeaef7fbf 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, size_t len = iocb->aio_nbytes; if (!vectored) { - ssize_t ret = import_single_range(rw, buf, len, *iovec, iter); + ssize_t ret = import_single_range(rw, buf, len, iter); *iovec = NULL; return ret; } diff --git a/include/linux/uio.h b/include/linux/uio.h index b6214cbf2a43..bfafd3542fa7 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i); + struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce407426..4efc75d90a0d 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2177,14 +2177,13 @@ static int user_events_open(struct inode *node, struct file *file) static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { - struct iovec iov; struct iov_iter i; if (unlikely(*ppos != 0)) return -EFAULT; if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &iov, &i))) + count, &i))) return -EFAULT; return user_events_write_core(file, &i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index de7d11cf4c63..d60c73354e1f 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1370,7 +1370,7 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, EXPORT_SYMBOL(import_iovec); int import_single_range(int rw, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i) + struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 53bcc17c91e4..57cf3adb191a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1849,7 +1849,6 @@ static int receive_fallback_to_copy(struct sock *sk, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; zc->length = 0; @@ -1859,7 +1858,7 @@ static int receive_fallback_to_copy(struct sock *sk, return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &iov, &msg.msg_iter); + inq, &msg.msg_iter); if (err) return err; @@ -1886,14 +1885,13 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; - struct iovec iov; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &iov, &msg.msg_iter); + copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); diff --git a/net/socket.c b/net/socket.c index 3379c64217a4..1f0d0e8d0a50 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2161,10 +2161,9 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr_storage address; int err; struct msghdr msg; - struct iovec iov; int fput_needed; - err = import_single_range(ITER_SOURCE, buff, len, &iov, &msg.msg_iter); + err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2226,11 +2225,10 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, .msg_name = addr ? (struct sockaddr *)&address : NULL, }; struct socket *sock; - struct iovec iov; int err, err2; int fput_needed; - err = import_single_range(ITER_DEST, ubuf, size, &iov, &msg.msg_iter); + err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 19be69fa4d05..193df7ca3ca8 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1252,12 +1252,11 @@ long keyctl_instantiate_key(key_serial_t id, key_serial_t ringid) { if (_payload && plen) { - struct iovec iov; struct iov_iter from; int ret; ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen, - &iov, &from); + &from); if (unlikely(ret)) return ret; From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:50 -0700 Subject: [PATCH 122/310] iov_iter: replace import_single_range() with import_ubuf() With the removal of the 'iov' argument to import_single_range(), the two functions are now fully identical. Convert the import_single_range() callers to import_ubuf(), and remove the former fully. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk Signed-off-by: Christian Brauner --- drivers/block/ublk_drv.c | 7 ++----- drivers/char/random.c | 4 ++-- fs/aio.c | 2 +- include/linux/uio.h | 2 -- kernel/trace/trace_events_user.c | 3 +-- lib/iov_iter.c | 13 ------------- net/ipv4/tcp.c | 8 ++++---- net/socket.c | 4 ++-- security/keys/keyctl.c | 4 ++-- 9 files changed, 14 insertions(+), 33 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5656b0a1233d..3eaf02ebeebe 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -895,9 +895,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct iov_iter iter; const int dir = ITER_DEST; - import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, - &iter); - + import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -918,8 +916,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, WARN_ON_ONCE(io->res > rq_bytes); - import_single_range(dir, u64_to_user_ptr(io->addr), io->res, - &iter); + import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; diff --git a/drivers/char/random.c b/drivers/char/random.c index e79ae238b30d..456be28ba67c 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1384,7 +1384,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags return ret; } - ret = import_single_range(ITER_DEST, ubuf, len, &iter); + ret = import_ubuf(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); @@ -1501,7 +1501,7 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -EINVAL; if (get_user(len, p++)) return -EFAULT; - ret = import_single_range(ITER_SOURCE, p, len, &iter); + ret = import_ubuf(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); diff --git a/fs/aio.c b/fs/aio.c index 251eeaef7fbf..4ea639509d41 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1498,7 +1498,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, size_t len = iocb->aio_nbytes; if (!vectored) { - ssize_t ret = import_single_range(rw, buf, len, iter); + ssize_t ret = import_ubuf(rw, buf, len, iter); *iovec = NULL; return ret; } diff --git a/include/linux/uio.h b/include/linux/uio.h index bfafd3542fa7..bea9c89922d9 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); -int import_single_range(int type, void __user *buf, size_t len, - struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 4efc75d90a0d..e76f5e1efdf2 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2182,8 +2182,7 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &i))) + if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i))) return -EFAULT; return user_events_write_core(file, &i); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index d60c73354e1f..009875bc95aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1369,19 +1369,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, } EXPORT_SYMBOL(import_iovec); -int import_single_range(int rw, void __user *buf, size_t len, - struct iov_iter *i) -{ - if (len > MAX_RW_COUNT) - len = MAX_RW_COUNT; - if (unlikely(!access_ok(buf, len))) - return -EFAULT; - - iov_iter_ubuf(i, rw, buf, len); - return 0; -} -EXPORT_SYMBOL(import_single_range); - int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57cf3adb191a..54d3c762d400 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1857,8 +1857,8 @@ static int receive_fallback_to_copy(struct sock *sk, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - inq, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, + &msg.msg_iter); if (err) return err; @@ -1890,8 +1890,8 @@ static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, if (copy_address != zc->copybuf_address) return -EINVAL; - err = import_single_range(ITER_DEST, (void __user *)copy_address, - copylen, &msg.msg_iter); + err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, + &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); diff --git a/net/socket.c b/net/socket.c index 1f0d0e8d0a50..5d49ae0c1b79 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2163,7 +2163,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct msghdr msg; int fput_needed; - err = import_single_range(ITER_SOURCE, buff, len, &msg.msg_iter); + err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); @@ -2228,7 +2228,7 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, int err, err2; int fput_needed; - err = import_single_range(ITER_DEST, ubuf, size, &msg.msg_iter); + err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 193df7ca3ca8..10ba439968f7 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1255,8 +1255,8 @@ long keyctl_instantiate_key(key_serial_t id, struct iov_iter from; int ret; - ret = import_single_range(ITER_SOURCE, (void __user *)_payload, plen, - &from); + ret = import_ubuf(ITER_SOURCE, (void __user *)_payload, plen, + &from); if (unlikely(ret)) return ret; From ced242ba9d7cb3571f6e0f165f643cb832d52148 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Dec 2023 14:36:04 +0000 Subject: [PATCH 123/310] KVM: arm64: Remove VPIPT I-cache handling We have some special handling for VPIPT I-cache in critical parts of the cache and TLB maintenance. Remove it. Reviewed-by: Zenghui Yu Reviewed-by: Anshuman Khandual Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231204143606.1806432-2-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_mmu.h | 7 ---- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 61 -------------------------------- arch/arm64/kvm/hyp/vhe/tlb.c | 13 ------- 4 files changed, 1 insertion(+), 82 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 49e0d4b36bd0..e3e793d0ec30 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -243,13 +243,6 @@ static inline size_t __invalidate_icache_max_range(void) static inline void __invalidate_icache_guest_page(void *va, size_t size) { - /* - * VPIPT I-cache maintenance must be done from EL2. See comment in the - * nVHE flavor of __kvm_tlb_flush_vmid_ipa(). - */ - if (icache_is_vpipt() && read_sysreg(CurrentEL) != CurrentEL_EL2) - return; - /* * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the * invalidation range exceeds our arbitrary limit on invadations by diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 9d23a51d7f75..b29f15418c0a 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -12,7 +12,7 @@ #include #include -/* Used by icache_is_vpipt(). */ +/* Used by icache_is_aliasing(). */ unsigned long __icache_flags; /* Used by kvm_get_vttbr(). */ diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 1b265713d6be..a60fb13e2192 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -105,28 +105,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - /* - * If the host is running at EL1 and we have a VPIPT I-cache, - * then we must perform I-cache maintenance at EL2 in order for - * it to have an effect on the guest. Since the guest cannot hit - * I-cache lines allocated with a different VMID, we don't need - * to worry about junk out of guest reset (we nuke the I-cache on - * VMID rollover), but we do need to be careful when remapping - * executable pages for the same guest. This can happen when KSM - * takes a CoW fault on an executable page, copies the page into - * a page that was previously mapped in the guest and then needs - * to invalidate the guest view of the I-cache for that page - * from EL1. To solve this, we invalidate the entire I-cache when - * unmapping a page from a guest if we have a VPIPT I-cache but - * the host is running at EL1. As above, we could do better if - * we had the VA. - * - * The moral of this story is: if you have a VPIPT I-cache, then - * you should be running with VHE enabled. - */ - if (icache_is_vpipt()) - icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); } @@ -157,28 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, dsb(nsh); isb(); - /* - * If the host is running at EL1 and we have a VPIPT I-cache, - * then we must perform I-cache maintenance at EL2 in order for - * it to have an effect on the guest. Since the guest cannot hit - * I-cache lines allocated with a different VMID, we don't need - * to worry about junk out of guest reset (we nuke the I-cache on - * VMID rollover), but we do need to be careful when remapping - * executable pages for the same guest. This can happen when KSM - * takes a CoW fault on an executable page, copies the page into - * a page that was previously mapped in the guest and then needs - * to invalidate the guest view of the I-cache for that page - * from EL1. To solve this, we invalidate the entire I-cache when - * unmapping a page from a guest if we have a VPIPT I-cache but - * the host is running at EL1. As above, we could do better if - * we had the VA. - * - * The moral of this story is: if you have a VPIPT I-cache, then - * you should be running with VHE enabled. - */ - if (icache_is_vpipt()) - icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); } @@ -205,10 +161,6 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); isb(); - /* See the comment in __kvm_tlb_flush_vmid_ipa() */ - if (icache_is_vpipt()) - icache_inval_all_pou(); - __tlb_switch_to_host(&cxt); } @@ -246,18 +198,5 @@ void __kvm_flush_vm_context(void) /* Same remark as in __tlb_switch_to_guest() */ dsb(ish); __tlbi(alle1is); - - /* - * VIPT and PIPT caches are not affected by VMID, so no maintenance - * is necessary across a VMID rollover. - * - * VPIPT caches constrain lookup and maintenance to the active VMID, - * so we need to invalidate lines with a stale VMID to avoid an ABA - * race after multiple rollovers. - * - */ - if (icache_is_vpipt()) - asm volatile("ic ialluis"); - dsb(ish); } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index b636b4111dbf..b32e2940df7d 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -216,18 +216,5 @@ void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - - /* - * VIPT and PIPT caches are not affected by VMID, so no maintenance - * is necessary across a VMID rollover. - * - * VPIPT caches constrain lookup and maintenance to the active VMID, - * so we need to invalidate lines with a stale VMID to avoid an ABA - * race after multiple rollovers. - * - */ - if (icache_is_vpipt()) - asm volatile("ic ialluis"); - dsb(ish); } From d8e12a0d3715fbcc26fb2baac979bd07ba4c08d0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Dec 2023 14:36:05 +0000 Subject: [PATCH 124/310] arm64: Kill detection of VPIPT i-cache policy Since the kernel will never run on a system with the VPIPT i-cache policy, drop the detection code altogether. Reviewed-by: Zenghui Yu Reviewed-by: Anshuman Khandual Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231204143606.1806432-3-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/cache.h | 6 ------ arch/arm64/kernel/cpuinfo.c | 5 ----- 2 files changed, 11 deletions(-) diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index ceb368d33bf4..06a4670bdb0b 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -58,7 +58,6 @@ static inline unsigned int arch_slab_minalign(void) #define CTR_L1IP(ctr) SYS_FIELD_GET(CTR_EL0, L1Ip, ctr) #define ICACHEF_ALIASING 0 -#define ICACHEF_VPIPT 1 extern unsigned long __icache_flags; /* @@ -70,11 +69,6 @@ static inline int icache_is_aliasing(void) return test_bit(ICACHEF_ALIASING, &__icache_flags); } -static __always_inline int icache_is_vpipt(void) -{ - return test_bit(ICACHEF_VPIPT, &__icache_flags); -} - static inline u32 cache_type_cwg(void) { return SYS_FIELD_GET(CTR_EL0, CWG, read_cpuid_cachetype()); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index a257da7b56fe..47043c0d95ec 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -36,8 +36,6 @@ static struct cpuinfo_arm64 boot_cpu_data; static inline const char *icache_policy_str(int l1ip) { switch (l1ip) { - case CTR_EL0_L1Ip_VPIPT: - return "VPIPT"; case CTR_EL0_L1Ip_VIPT: return "VIPT"; case CTR_EL0_L1Ip_PIPT: @@ -388,9 +386,6 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) switch (l1ip) { case CTR_EL0_L1Ip_PIPT: break; - case CTR_EL0_L1Ip_VPIPT: - set_bit(ICACHEF_VPIPT, &__icache_flags); - break; case CTR_EL0_L1Ip_VIPT: default: /* Assume aliasing */ From f35c32ca6839f5777862dbe2138d02bf50b3dfa7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Dec 2023 14:36:06 +0000 Subject: [PATCH 125/310] arm64: Rename reserved values for CTR_EL0.L1Ip We now have *two* values for CTR_EL0.L1Ip that are reserved. Which makes things a bit awkward. In order to lift the ambiguity, rename RESERVED (0b01) to RESERVED_AIVIVT, and VPIPT (0b00) to RESERVED_VPIPT. This makes it clear which of these meant what, and I'm sure archeologists will find it useful... Reviewed-by: Zenghui Yu Signed-off-by: Marc Zyngier Reviewed-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231204143606.1806432-4-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 96cbeeab4eec..c5af75b23187 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2004,9 +2004,10 @@ Field 27:24 CWG Field 23:20 ERG Field 19:16 DminLine Enum 15:14 L1Ip - 0b00 VPIPT + # This was named as VPIPT in the ARM but now documented as reserved + 0b00 RESERVED_VPIPT # This is named as AIVIVT in the ARM but documented as reserved - 0b01 RESERVED + 0b01 RESERVED_AIVIVT 0b10 VIPT 0b11 PIPT EndEnum From a099bec7a81052a324d07c9be7c24dc92fbd8ad1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 17 Nov 2023 21:56:19 +0900 Subject: [PATCH 126/310] arm64: vdso32: rename 32-bit debug vdso to vdso32.so.dbg 'make vdso_install' renames arch/arm64/kernel/vdso32/vdso.so.dbg to vdso32.so during installation, which allows 64-bit and 32-bit vdso files to be installed in the same directory. However, arm64 is the only architecture that requires this renaming. To simplify the vdso_install logic, rename the in-tree vdso file so its base name matches the installed file name. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20231117125620.1058300-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 2 +- arch/arm64/kernel/vdso32/Makefile | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 9a2d3723cd0f..47ecc4cff9d2 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -200,7 +200,7 @@ endif endif vdso-install-y += arch/arm64/kernel/vdso/vdso.so.dbg -vdso-install-$(CONFIG_COMPAT_VDSO) += arch/arm64/kernel/vdso32/vdso.so.dbg:vdso32.so +vdso-install-$(CONFIG_COMPAT_VDSO) += arch/arm64/kernel/vdso32/vdso32.so.dbg include $(srctree)/scripts/Makefile.defconf diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 1f911a76c5af..2266fcdff78a 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -118,7 +118,7 @@ endif VDSO_CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os # Build rules -targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso.so.dbg vdso.so.raw +targets := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) vdso.so vdso32.so.dbg vdso.so.raw c-obj-vdso := $(addprefix $(obj)/, $(c-obj-vdso)) c-obj-vdso-gettimeofday := $(addprefix $(obj)/, $(c-obj-vdso-gettimeofday)) asm-obj-vdso := $(addprefix $(obj)/, $(asm-obj-vdso)) @@ -127,15 +127,15 @@ obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE $(call if_changed,vdsosym) # Strip rule for vdso.so $(obj)/vdso.so: OBJCOPYFLAGS := -S -$(obj)/vdso.so: $(obj)/vdso.so.dbg FORCE +$(obj)/vdso.so: $(obj)/vdso32.so.dbg FORCE $(call if_changed,objcopy) -$(obj)/vdso.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE +$(obj)/vdso32.so.dbg: $(obj)/vdso.so.raw $(obj)/$(munge) FORCE $(call if_changed,vdsomunge) # Link rule for the .so file, .lds has to be first From 103423ad7e56d6c756738823c332c414b07899e6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 22 Nov 2023 13:37:54 +0000 Subject: [PATCH 127/310] arm64: Get rid of ARM64_HAS_NO_HW_PREFETCH Back in 2016, it was argued that implementations lacking a HW prefetcher could be helped by sprinkling a number of PRFM instructions in strategic locations. In 2023, the one platform that presumably needed this hack is no longer in active use (let alone maintained), and an quick experiment shows dropping this hack only leads to a 0.4% drop on a full kernel compilation (tested on a MT30-GS0 48 CPU system). Given that this is pretty much in the noise department and that it may give odd ideas to other implementers, drop the hack for good. Suggested-by: Will Deacon Suggested-by: Mark Rutland Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20231122133754.1240687-1-maz@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 16 ---------------- arch/arm64/lib/copy_page.S | 11 ----------- arch/arm64/tools/cpucaps | 1 - 3 files changed, 28 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 646591c67e7a..b335da126e86 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1584,16 +1584,6 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, return has_sre; } -static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int __unused) -{ - u32 midr = read_cpuid_id(); - - /* Cavium ThunderX pass 1.x and 2.x */ - return midr_is_cpu_model_range(midr, MIDR_THUNDERX, - MIDR_CPU_VAR_REV(0, 0), - MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK)); -} - static bool has_cache_idc(const struct arm64_cpu_capabilities *entry, int scope) { @@ -2321,12 +2311,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP) }, #endif /* CONFIG_ARM64_LSE_ATOMICS */ - { - .desc = "Software prefetching using PRFM", - .capability = ARM64_HAS_NO_HW_PREFETCH, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, - .matches = has_no_hw_prefetch, - }, { .desc = "Virtualization Host Extensions", .capability = ARM64_HAS_VIRT_HOST_EXTN, diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index c336d2ffdec5..6a56d7cf309d 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,13 +18,6 @@ * x1 - src */ SYM_FUNC_START(__pi_copy_page) -alternative_if ARM64_HAS_NO_HW_PREFETCH - // Prefetch three cache lines ahead. - prfm pldl1strm, [x1, #128] - prfm pldl1strm, [x1, #256] - prfm pldl1strm, [x1, #384] -alternative_else_nop_endif - ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -39,10 +32,6 @@ alternative_else_nop_endif 1: tst x0, #(PAGE_SIZE - 1) -alternative_if ARM64_HAS_NO_HW_PREFETCH - prfm pldl1strm, [x1, #384] -alternative_else_nop_endif - stnp x2, x3, [x0, #-256] ldp x2, x3, [x1] stnp x4, x5, [x0, #16 - 256] diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index b98c38288a9d..0eb2a2d2f783 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -40,7 +40,6 @@ HAS_LDAPR HAS_LSE_ATOMICS HAS_MOPS HAS_NESTED_VIRT -HAS_NO_HW_PREFETCH HAS_PAN HAS_S1PIE HAS_RAS_EXTN From 590f23b092401f29e410fd4ca67128fcc45192fc Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 23 Nov 2023 11:58:13 +0000 Subject: [PATCH 128/310] perf/arm-cmn: Fix HN-F class_occup_id events A subtle copy-paste error managed to slip through the reorganisation of these patches in development, and not only give some HN-F events the wrong type, but use that wrong type before the subsequent patch defined it. Too late to fix history, but we can at least fix the bug. Fixes: b1b7dc38e482 ("perf/arm-cmn: Refactor HN-F event selector macros") Reported-by: Jing Zhang Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/5a22439de84ff188ef76674798052448eb03a3e1.1700740693.git.robin.murphy@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 014010d03588..86d970e74129 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -811,7 +811,7 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, #define CMN_EVENT_HNF_OCC(_model, _name, _event) \ CMN_EVENT_HN_OCC(_model, hnf_##_name, CMN_TYPE_HNF, _event) #define CMN_EVENT_HNF_CLS(_model, _name, _event) \ - CMN_EVENT_HN_CLS(_model, hnf_##_name, CMN_TYPE_HNS, _event) + CMN_EVENT_HN_CLS(_model, hnf_##_name, CMN_TYPE_HNF, _event) #define CMN_EVENT_HNF_SNT(_model, _name, _event) \ CMN_EVENT_HN_SNT(_model, hnf_##_name, CMN_TYPE_HNF, _event) From 877806b9b41ea371defaaf58d815320f1a76384f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 14 Nov 2023 11:46:56 +0530 Subject: [PATCH 129/310] drivers: perf: arm_pmuv3: Add new macro PMUV3_INIT_MAP_EVENT() This further compacts all remaining PMU init procedures requiring specific map_event functions via a new macro PMUV3_INIT_MAP_EVENT(). While here, it also changes generated init function names to match to those generated via the other macro PMUV3_INIT_SIMPLE(). This does not cause functional change. Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: James Clark Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20231114061656.337231-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 61 +++++++++++++--------------------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 6ca7be05229c..c136a6529014 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1221,6 +1221,12 @@ static int name##_pmu_init(struct arm_pmu *cpu_pmu) \ return armv8_pmu_init(cpu_pmu, #name, armv8_pmuv3_map_event); \ } +#define PMUV3_INIT_MAP_EVENT(name, map_event) \ +static int name##_pmu_init(struct arm_pmu *cpu_pmu) \ +{ \ + return armv8_pmu_init(cpu_pmu, #name, map_event); \ +} + PMUV3_INIT_SIMPLE(armv8_pmuv3) PMUV3_INIT_SIMPLE(armv8_cortex_a34) @@ -1247,51 +1253,24 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v1) PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) PMUV3_INIT_SIMPLE(armv8_nvidia_denver) -static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a35", armv8_a53_map_event); -} - -static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a53", armv8_a53_map_event); -} - -static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a57", armv8_a57_map_event); -} - -static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a72", armv8_a57_map_event); -} - -static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a73", armv8_a73_map_event); -} - -static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_cavium_thunder", armv8_thunder_map_event); -} - -static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) -{ - return armv8_pmu_init(cpu_pmu, "armv8_brcm_vulcan", armv8_vulcan_map_event); -} +PMUV3_INIT_MAP_EVENT(armv8_cortex_a35, armv8_a53_map_event) +PMUV3_INIT_MAP_EVENT(armv8_cortex_a53, armv8_a53_map_event) +PMUV3_INIT_MAP_EVENT(armv8_cortex_a57, armv8_a57_map_event) +PMUV3_INIT_MAP_EVENT(armv8_cortex_a72, armv8_a57_map_event) +PMUV3_INIT_MAP_EVENT(armv8_cortex_a73, armv8_a73_map_event) +PMUV3_INIT_MAP_EVENT(armv8_cavium_thunder, armv8_thunder_map_event) +PMUV3_INIT_MAP_EVENT(armv8_brcm_vulcan, armv8_vulcan_map_event) static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,armv8-pmuv3", .data = armv8_pmuv3_pmu_init}, {.compatible = "arm,cortex-a34-pmu", .data = armv8_cortex_a34_pmu_init}, - {.compatible = "arm,cortex-a35-pmu", .data = armv8_a35_pmu_init}, - {.compatible = "arm,cortex-a53-pmu", .data = armv8_a53_pmu_init}, + {.compatible = "arm,cortex-a35-pmu", .data = armv8_cortex_a35_pmu_init}, + {.compatible = "arm,cortex-a53-pmu", .data = armv8_cortex_a53_pmu_init}, {.compatible = "arm,cortex-a55-pmu", .data = armv8_cortex_a55_pmu_init}, - {.compatible = "arm,cortex-a57-pmu", .data = armv8_a57_pmu_init}, + {.compatible = "arm,cortex-a57-pmu", .data = armv8_cortex_a57_pmu_init}, {.compatible = "arm,cortex-a65-pmu", .data = armv8_cortex_a65_pmu_init}, - {.compatible = "arm,cortex-a72-pmu", .data = armv8_a72_pmu_init}, - {.compatible = "arm,cortex-a73-pmu", .data = armv8_a73_pmu_init}, + {.compatible = "arm,cortex-a72-pmu", .data = armv8_cortex_a72_pmu_init}, + {.compatible = "arm,cortex-a73-pmu", .data = armv8_cortex_a73_pmu_init}, {.compatible = "arm,cortex-a75-pmu", .data = armv8_cortex_a75_pmu_init}, {.compatible = "arm,cortex-a76-pmu", .data = armv8_cortex_a76_pmu_init}, {.compatible = "arm,cortex-a77-pmu", .data = armv8_cortex_a77_pmu_init}, @@ -1309,8 +1288,8 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "arm,neoverse-n1-pmu", .data = armv8_neoverse_n1_pmu_init}, {.compatible = "arm,neoverse-n2-pmu", .data = armv9_neoverse_n2_pmu_init}, {.compatible = "arm,neoverse-v1-pmu", .data = armv8_neoverse_v1_pmu_init}, - {.compatible = "cavium,thunder-pmu", .data = armv8_thunder_pmu_init}, - {.compatible = "brcm,vulcan-pmu", .data = armv8_vulcan_pmu_init}, + {.compatible = "cavium,thunder-pmu", .data = armv8_cavium_thunder_pmu_init}, + {.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init}, {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, {.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init}, {}, From ca6f537e459e2da4b331fe8928d1a0b0f9301f42 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 4 Dec 2023 11:58:47 +0000 Subject: [PATCH 130/310] drivers/perf: pmuv3: don't expose SW_INCR event in sysfs The SW_INCR event is somewhat unusual, and depends on the specific HW counter that it is programmed into. When programmed into PMEVCNTR, SW_INCR will count any writes to PMSWINC_EL0 with bit n set, ignoring writes to SW_INCR with bit n clear. Event rotation means that there's no fixed relationship between perf_events and HW counters, so this isn't all that useful. Further, we program PMUSERENR.{SW,EN}=={0,0}, which causes EL0 writes to PMSWINC_EL0 to be trapped and handled as UNDEFINED, resulting in a SIGILL to userspace. Given that, it's not a good idea to expose SW_INCR in sysfs. Hide it as we did for CHAIN back in commit: 4ba2578fa7b55701 ("arm64: perf: don't expose CHAIN event in sysfs") Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20231204115847.2993026-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index c136a6529014..bbde60f1b08c 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -169,7 +169,11 @@ armv8pmu_events_sysfs_show(struct device *dev, PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config) static struct attribute *armv8_pmuv3_event_attrs[] = { - ARMV8_EVENT_ATTR(sw_incr, ARMV8_PMUV3_PERFCTR_SW_INCR), + /* + * Don't expose the sw_incr event in /sys. It's not usable as writes to + * PMSWINC_EL0 will trap as PMUSERENR.{SW,EN}=={0,0} and event rotation + * means we don't have a fixed event<->counter relationship regardless. + */ ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL), ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL), ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL), From 38bbef7240b8c5f2dc4493eec356e2efbf2da5f4 Mon Sep 17 00:00:00 2001 From: Junhao He Date: Mon, 4 Dec 2023 19:04:25 +0800 Subject: [PATCH 131/310] drivers/perf: hisi: Fix some event id for HiSilicon UC pmu Some event id of HiSilicon uncore UC PMU driver is incorrect, fix them. Fixes: 312eca95e28d ("drivers/perf: hisi: Add support for HiSilicon UC PMU driver") Signed-off-by: Junhao He Reviewed-by: Yicong Yang Link: https://lore.kernel.org/r/20231204110425.20354-1-hejunhao3@huawei.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_uc_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c index 63da05e5831c..636fb79647c8 100644 --- a/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_uc_pmu.c @@ -383,8 +383,8 @@ static struct attribute *hisi_uc_pmu_events_attr[] = { HISI_PMU_EVENT_ATTR(cpu_rd, 0x10), HISI_PMU_EVENT_ATTR(cpu_rd64, 0x17), HISI_PMU_EVENT_ATTR(cpu_rs64, 0x19), - HISI_PMU_EVENT_ATTR(cpu_mru, 0x1a), - HISI_PMU_EVENT_ATTR(cycles, 0x9c), + HISI_PMU_EVENT_ATTR(cpu_mru, 0x1c), + HISI_PMU_EVENT_ATTR(cycles, 0x95), HISI_PMU_EVENT_ATTR(spipe_hit, 0xb3), HISI_PMU_EVENT_ATTR(hpipe_hit, 0xdb), HISI_PMU_EVENT_ATTR(cring_rxdat_cnt, 0xfa), From 5cd7da19cb9714adbf56054e0a0bd044f49e2a8e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 15 Nov 2023 14:58:04 +0530 Subject: [PATCH 132/310] arm: perf: Remove PMU locking Currently the 32-bit arm PMU drivers use the pmu_hw_events::lock spinlock in their arm_pmu::{start,stop,enable,disable}() callbacks to protect hardware state and event data. This locking is not necessary as the perf core code already provides mutual exclusion, disabling interrupts to serialize against the IRQ handler, and using perf_event_context::lock to protect against concurrent modifications of events cross-cpu. The locking was removed from the arm64 (now PMUv3) PMU driver in commit: 2a0e2a02e4b7 ("arm64: perf: Remove PMU locking") ... and the same reasoning applies to all the 32-bit PMU drivers. Remove the locking from the 32-bit PMU drivers. Cc: Mark Rutland Cc: Will Deacon Cc: Russell King Cc: linux-perf-users@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231115092805.737822-2-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm/kernel/perf_event_v6.c | 28 ++++-------------- arch/arm/kernel/perf_event_v7.c | 44 ----------------------------- arch/arm/kernel/perf_event_xscale.c | 44 ++++++----------------------- 3 files changed, 13 insertions(+), 103 deletions(-) diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index 1ae99deeec54..8fc080c9e4fb 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -268,10 +268,8 @@ static inline void armv6pmu_write_counter(struct perf_event *event, u64 value) static void armv6pmu_enable_event(struct perf_event *event) { - unsigned long val, mask, evt, flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long val, mask, evt; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; if (ARMV6_CYCLE_COUNTER == idx) { @@ -294,12 +292,10 @@ static void armv6pmu_enable_event(struct perf_event *event) * Mask out the current event and set the counter to count the event * that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = armv6_pmcr_read(); val &= ~mask; val |= evt; armv6_pmcr_write(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static irqreturn_t @@ -362,26 +358,20 @@ armv6pmu_handle_irq(struct arm_pmu *cpu_pmu) static void armv6pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = armv6_pmcr_read(); val |= ARMV6_PMCR_ENABLE; armv6_pmcr_write(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv6pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = armv6_pmcr_read(); val &= ~ARMV6_PMCR_ENABLE; armv6_pmcr_write(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static int @@ -419,10 +409,8 @@ static void armv6pmu_clear_event_idx(struct pmu_hw_events *cpuc, static void armv6pmu_disable_event(struct perf_event *event) { - unsigned long val, mask, evt, flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long val, mask, evt; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; if (ARMV6_CYCLE_COUNTER == idx) { @@ -444,20 +432,16 @@ static void armv6pmu_disable_event(struct perf_event *event) * of ETM bus signal assertion cycles. The external reporting should * be disabled and so this should never increment. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = armv6_pmcr_read(); val &= ~mask; val |= evt; armv6_pmcr_write(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv6mpcore_pmu_disable_event(struct perf_event *event) { - unsigned long val, mask, flags, evt = 0; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long val, mask, evt = 0; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; if (ARMV6_CYCLE_COUNTER == idx) { @@ -475,12 +459,10 @@ static void armv6mpcore_pmu_disable_event(struct perf_event *event) * Unlike UP ARMv6, we don't have a way of stopping the counters. We * simply disable the interrupt reporting. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = armv6_pmcr_read(); val &= ~mask; val |= evt; armv6_pmcr_write(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static int armv6_map_event(struct perf_event *event) diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index eb2190477da1..c890354b04e9 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -870,10 +870,8 @@ static void armv7_pmnc_dump_regs(struct arm_pmu *cpu_pmu) static void armv7pmu_enable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) { @@ -886,7 +884,6 @@ static void armv7pmu_enable_event(struct perf_event *event) * Enable counter and interrupt, and set the counter to count * the event that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* * Disable counter @@ -910,16 +907,12 @@ static void armv7pmu_enable_event(struct perf_event *event) * Enable counter */ armv7_pmnc_enable_counter(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv7pmu_disable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; if (!armv7_pmnc_counter_valid(cpu_pmu, idx)) { @@ -931,7 +924,6 @@ static void armv7pmu_disable_event(struct perf_event *event) /* * Disable counter and interrupt */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* * Disable counter @@ -942,8 +934,6 @@ static void armv7pmu_disable_event(struct perf_event *event) * Disable interrupt for this counter */ armv7_pmnc_disable_intens(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static irqreturn_t armv7pmu_handle_irq(struct arm_pmu *cpu_pmu) @@ -1009,24 +999,14 @@ static irqreturn_t armv7pmu_handle_irq(struct arm_pmu *cpu_pmu) static void armv7pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Enable all counters */ armv7_pmnc_write(armv7_pmnc_read() | ARMV7_PMNC_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv7pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable all counters */ armv7_pmnc_write(armv7_pmnc_read() & ~ARMV7_PMNC_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static int armv7pmu_get_event_idx(struct pmu_hw_events *cpuc, @@ -1492,14 +1472,10 @@ static void krait_clearpmu(u32 config_base) static void krait_pmu_disable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); /* Disable counter and interrupt */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable counter */ armv7_pmnc_disable_counter(idx); @@ -1512,23 +1488,17 @@ static void krait_pmu_disable_event(struct perf_event *event) /* Disable interrupt for this counter */ armv7_pmnc_disable_intens(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void krait_pmu_enable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); /* * Enable counter and interrupt, and set the counter to count * the event that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable counter */ armv7_pmnc_disable_counter(idx); @@ -1548,8 +1518,6 @@ static void krait_pmu_enable_event(struct perf_event *event) /* Enable counter */ armv7_pmnc_enable_counter(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void krait_pmu_reset(void *info) @@ -1825,14 +1793,10 @@ static void scorpion_clearpmu(u32 config_base) static void scorpion_pmu_disable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); /* Disable counter and interrupt */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable counter */ armv7_pmnc_disable_counter(idx); @@ -1845,23 +1809,17 @@ static void scorpion_pmu_disable_event(struct perf_event *event) /* Disable interrupt for this counter */ armv7_pmnc_disable_intens(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void scorpion_pmu_enable_event(struct perf_event *event) { - unsigned long flags; struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); /* * Enable counter and interrupt, and set the counter to count * the event that we're interested in. */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable counter */ armv7_pmnc_disable_counter(idx); @@ -1881,8 +1839,6 @@ static void scorpion_pmu_enable_event(struct perf_event *event) /* Enable counter */ armv7_pmnc_enable_counter(idx); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void scorpion_pmu_reset(void *info) diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index f6cdcacfb96d..7a2ba1c689a7 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -203,10 +203,8 @@ xscale1pmu_handle_irq(struct arm_pmu *cpu_pmu) static void xscale1pmu_enable_event(struct perf_event *event) { - unsigned long val, mask, evt, flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long val, mask, evt; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; switch (idx) { @@ -229,20 +227,16 @@ static void xscale1pmu_enable_event(struct perf_event *event) return; } - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale1pmu_read_pmnc(); val &= ~mask; val |= evt; xscale1pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void xscale1pmu_disable_event(struct perf_event *event) { - unsigned long val, mask, evt, flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long val, mask, evt; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; switch (idx) { @@ -263,12 +257,10 @@ static void xscale1pmu_disable_event(struct perf_event *event) return; } - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale1pmu_read_pmnc(); val &= ~mask; val |= evt; xscale1pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static int @@ -300,26 +292,20 @@ static void xscalepmu_clear_event_idx(struct pmu_hw_events *cpuc, static void xscale1pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale1pmu_read_pmnc(); val |= XSCALE_PMU_ENABLE; xscale1pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void xscale1pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale1pmu_read_pmnc(); val &= ~XSCALE_PMU_ENABLE; xscale1pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static inline u64 xscale1pmu_read_counter(struct perf_event *event) @@ -549,10 +535,8 @@ xscale2pmu_handle_irq(struct arm_pmu *cpu_pmu) static void xscale2pmu_enable_event(struct perf_event *event) { - unsigned long flags, ien, evtsel; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long ien, evtsel; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; ien = xscale2pmu_read_int_enable(); @@ -587,18 +571,14 @@ static void xscale2pmu_enable_event(struct perf_event *event) return; } - raw_spin_lock_irqsave(&events->pmu_lock, flags); xscale2pmu_write_event_select(evtsel); xscale2pmu_write_int_enable(ien); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void xscale2pmu_disable_event(struct perf_event *event) { - unsigned long flags, ien, evtsel, of_flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); + unsigned long ien, evtsel, of_flags; struct hw_perf_event *hwc = &event->hw; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); int idx = hwc->idx; ien = xscale2pmu_read_int_enable(); @@ -638,11 +618,9 @@ static void xscale2pmu_disable_event(struct perf_event *event) return; } - raw_spin_lock_irqsave(&events->pmu_lock, flags); xscale2pmu_write_event_select(evtsel); xscale2pmu_write_int_enable(ien); xscale2pmu_write_overflow_flags(of_flags); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static int @@ -663,26 +641,20 @@ out: static void xscale2pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale2pmu_read_pmnc() & ~XSCALE_PMU_CNT64; val |= XSCALE_PMU_ENABLE; xscale2pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void xscale2pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags, val; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); + unsigned long val; - raw_spin_lock_irqsave(&events->pmu_lock, flags); val = xscale2pmu_read_pmnc(); val &= ~XSCALE_PMU_ENABLE; xscale2pmu_write_pmnc(val); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static inline u64 xscale2pmu_read_counter(struct perf_event *event) From 118eb89b1e7f6807776c012cffc5c9b07fd26164 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 15 Nov 2023 14:58:05 +0530 Subject: [PATCH 133/310] drivers: perf: arm_pmu: Drop 'pmu_lock' element from 'struct pmu_hw_events' As 'pmu_lock' element is not being used in any ARM PMU implementation, just drop this from 'struct pmu_hw_events'. Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231115092805.737822-3-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 1 - include/linux/perf/arm_pmu.h | 6 ------ 2 files changed, 7 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index d712a19e47ac..379479b50bdd 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -893,7 +893,6 @@ struct arm_pmu *armpmu_alloc(void) struct pmu_hw_events *events; events = per_cpu_ptr(pmu->hw_events, cpu); - raw_spin_lock_init(&events->pmu_lock); events->percpu_pmu = pmu; } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 143fbc10ecfe..e2503d48ddee 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -59,12 +59,6 @@ struct pmu_hw_events { */ DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS); - /* - * Hardware lock to serialize accesses to PMU registers. Needed for the - * read/modify/write sequences. - */ - raw_spinlock_t pmu_lock; - /* * When using percpu IRQs, we need a percpu dev_id. Place it here as we * already have to allocate this struct per cpu. From afd83967e7bb9b41ee71153b868dcf94e43c2d7a Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 20 Nov 2023 17:33:13 +0800 Subject: [PATCH 134/310] perf: fsl_imx8_ddr: Add AXI ID PORT CHANNEL filter support This is the extension of AXI ID filter. Filter is defined with 2 configuration registers per counter 1-3 (counter 0 is not used for filtering and lacks these registers). * Counter N MASK COMP register - AXI_ID and AXI_MASKING. * Counter N MUX CNTL register - AXI CHANNEL and AXI PORT. -- 0: address channel -- 1: data channel This filter is exposed to userspace as an additional (channel, port) pair. The definition of axi_channel is inverted in userspace, and it will be reverted in driver automatically. AXI filter of Perf Monitor in DDR Subsystem, only a single port0 exist, so axi_port is reserved which should be 0. e.g. perf stat -a -e imx8_ddr0/axid-read,axi_mask=0xMMMM,axi_id=0xDDDD,axi_channel=0xH/ cmd perf stat -a -e imx8_ddr0/axid-write,axi_mask=0xMMMM,axi_id=0xDDDD,axi_channel=0xH/ cmd Signed-off-by: Xu Yang Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20231120093317.2652866-1-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 92611c98120f..d0eae2d7e64b 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -19,6 +19,8 @@ #define COUNTER_READ 0x20 #define COUNTER_DPCR1 0x30 +#define COUNTER_MUX_CNTL 0x50 +#define COUNTER_MASK_COMP 0x54 #define CNTL_OVER 0x1 #define CNTL_CLEAR 0x2 @@ -32,6 +34,13 @@ #define CNTL_CSV_SHIFT 24 #define CNTL_CSV_MASK (0xFFU << CNTL_CSV_SHIFT) +#define READ_PORT_SHIFT 0 +#define READ_PORT_MASK (0x7 << READ_PORT_SHIFT) +#define READ_CHANNEL_REVERT 0x00000008 /* bit 3 for read channel select */ +#define WRITE_PORT_SHIFT 8 +#define WRITE_PORT_MASK (0x7 << WRITE_PORT_SHIFT) +#define WRITE_CHANNEL_REVERT 0x00000800 /* bit 11 for write channel select */ + #define EVENT_CYCLES_ID 0 #define EVENT_CYCLES_COUNTER 0 #define NUM_COUNTERS 4 @@ -50,6 +59,7 @@ static DEFINE_IDA(ddr_ida); /* DDR Perf hardware feature */ #define DDR_CAP_AXI_ID_FILTER 0x1 /* support AXI ID filter */ #define DDR_CAP_AXI_ID_FILTER_ENHANCED 0x3 /* support enhanced AXI ID filter */ +#define DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER 0x4 /* support AXI ID PORT CHANNEL filter */ struct fsl_ddr_devtype_data { unsigned int quirks; /* quirks needed for different DDR Perf core */ @@ -144,6 +154,7 @@ static const struct attribute_group ddr_perf_identifier_attr_group = { enum ddr_perf_filter_capabilities { PERF_CAP_AXI_ID_FILTER = 0, PERF_CAP_AXI_ID_FILTER_ENHANCED, + PERF_CAP_AXI_ID_PORT_CHANNEL_FILTER, PERF_CAP_AXI_ID_FEAT_MAX, }; @@ -157,6 +168,8 @@ static u32 ddr_perf_filter_cap_get(struct ddr_pmu *pmu, int cap) case PERF_CAP_AXI_ID_FILTER_ENHANCED: quirks &= DDR_CAP_AXI_ID_FILTER_ENHANCED; return quirks == DDR_CAP_AXI_ID_FILTER_ENHANCED; + case PERF_CAP_AXI_ID_PORT_CHANNEL_FILTER: + return !!(quirks & DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER); default: WARN(1, "unknown filter cap %d\n", cap); } @@ -187,6 +200,7 @@ static ssize_t ddr_perf_filter_cap_show(struct device *dev, static struct attribute *ddr_perf_filter_cap_attr[] = { PERF_FILTER_EXT_ATTR_ENTRY(filter, PERF_CAP_AXI_ID_FILTER), PERF_FILTER_EXT_ATTR_ENTRY(enhanced_filter, PERF_CAP_AXI_ID_FILTER_ENHANCED), + PERF_FILTER_EXT_ATTR_ENTRY(super_filter, PERF_CAP_AXI_ID_PORT_CHANNEL_FILTER), NULL, }; @@ -272,11 +286,15 @@ static const struct attribute_group ddr_perf_events_attr_group = { PMU_FORMAT_ATTR(event, "config:0-7"); PMU_FORMAT_ATTR(axi_id, "config1:0-15"); PMU_FORMAT_ATTR(axi_mask, "config1:16-31"); +PMU_FORMAT_ATTR(axi_port, "config2:0-2"); +PMU_FORMAT_ATTR(axi_channel, "config2:3-3"); static struct attribute *ddr_perf_format_attrs[] = { &format_attr_event.attr, &format_attr_axi_id.attr, &format_attr_axi_mask.attr, + &format_attr_axi_port.attr, + &format_attr_axi_channel.attr, NULL, }; @@ -530,6 +548,7 @@ static int ddr_perf_event_add(struct perf_event *event, int flags) int counter; int cfg = event->attr.config; int cfg1 = event->attr.config1; + int cfg2 = event->attr.config2; if (pmu->devtype_data->quirks & DDR_CAP_AXI_ID_FILTER) { int i; @@ -553,6 +572,26 @@ static int ddr_perf_event_add(struct perf_event *event, int flags) return -EOPNOTSUPP; } + if (pmu->devtype_data->quirks & DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER) { + if (ddr_perf_is_filtered(event)) { + /* revert axi id masking(axi_mask) value */ + cfg1 ^= AXI_MASKING_REVERT; + writel(cfg1, pmu->base + COUNTER_MASK_COMP + ((counter - 1) << 4)); + + if (cfg == 0x41) { + /* revert axi read channel(axi_channel) value */ + cfg2 ^= READ_CHANNEL_REVERT; + cfg2 |= FIELD_PREP(READ_PORT_MASK, cfg2); + } else { + /* revert axi write channel(axi_channel) value */ + cfg2 ^= WRITE_CHANNEL_REVERT; + cfg2 |= FIELD_PREP(WRITE_PORT_MASK, cfg2); + } + + writel(cfg2, pmu->base + COUNTER_MUX_CNTL + ((counter - 1) << 4)); + } + } + pmu->events[counter] = event; hwc->idx = counter; From 9745295358f44cc5da4145b2a5ca3e2921d70841 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 20 Nov 2023 17:33:14 +0800 Subject: [PATCH 135/310] docs/perf: Add explanation for DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER quirk Add explanation for DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER quirk. Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20231120093317.2652866-2-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/imx-ddr.rst | 45 ++++++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/perf/imx-ddr.rst b/Documentation/admin-guide/perf/imx-ddr.rst index 90926d0fb8ec..77418ae5a290 100644 --- a/Documentation/admin-guide/perf/imx-ddr.rst +++ b/Documentation/admin-guide/perf/imx-ddr.rst @@ -13,8 +13,8 @@ is one register for each counter. Counter 0 is special in that it always counts interrupt is raised. If any other counter overflows, it continues counting, and no interrupt is raised. -The "format" directory describes format of the config (event ID) and config1 -(AXI filtering) fields of the perf_event_attr structure, see /sys/bus/event_source/ +The "format" directory describes format of the config (event ID) and config1/2 +(AXI filter setting) fields of the perf_event_attr structure, see /sys/bus/event_source/ devices/imx8_ddr0/format/. The "events" directory describes the events types hardware supported that can be used with perf tool, see /sys/bus/event_source/ devices/imx8_ddr0/events/. The "caps" directory describes filter features implemented @@ -28,12 +28,11 @@ in DDR PMU, see /sys/bus/events_source/devices/imx8_ddr0/caps/. AXI filtering is only used by CSV modes 0x41 (axid-read) and 0x42 (axid-write) to count reading or writing matches filter setting. Filter setting is various from different DRAM controller implementations, which is distinguished by quirks -in the driver. You also can dump info from userspace, filter in "caps" directory -indicates whether PMU supports AXI ID filter or not; enhanced_filter indicates -whether PMU supports enhanced AXI ID filter or not. Value 0 for un-supported, and -value 1 for supported. +in the driver. You also can dump info from userspace, "caps" directory show the +type of AXI filter (filter, enhanced_filter and super_filter). Value 0 for +un-supported, and value 1 for supported. -* With DDR_CAP_AXI_ID_FILTER quirk(filter: 1, enhanced_filter: 0). +* With DDR_CAP_AXI_ID_FILTER quirk(filter: 1, enhanced_filter: 0, super_filter: 0). Filter is defined with two configuration parts: --AXI_ID defines AxID matching value. --AXI_MASKING defines which bits of AxID are meaningful for the matching. @@ -65,7 +64,37 @@ value 1 for supported. perf stat -a -e imx8_ddr0/axid-read,axi_id=0x12/ cmd, which will monitor ARID=0x12 -* With DDR_CAP_AXI_ID_FILTER_ENHANCED quirk(filter: 1, enhanced_filter: 1). +* With DDR_CAP_AXI_ID_FILTER_ENHANCED quirk(filter: 1, enhanced_filter: 1, super_filter: 0). This is an extension to the DDR_CAP_AXI_ID_FILTER quirk which permits counting the number of bytes (as opposed to the number of bursts) from DDR read and write transactions concurrently with another set of data counters. + +* With DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER quirk(filter: 0, enhanced_filter: 0, super_filter: 1). + There is a limitation in previous AXI filter, it cannot filter different IDs + at the same time as the filter is shared between counters. This quirk is the + extension of AXI ID filter. One improvement is that counter 1-3 has their own + filter, means that it supports concurrently filter various IDs. Another + improvement is that counter 1-3 supports AXI PORT and CHANNEL selection. Support + selecting address channel or data channel. + + Filter is defined with 2 configuration registers per counter 1-3. + --Counter N MASK COMP register - including AXI_ID and AXI_MASKING. + --Counter N MUX CNTL register - including AXI CHANNEL and AXI PORT. + + - 0: address channel + - 1: data channel + + PMU in DDR subsystem, only one single port0 exists, so axi_port is reserved + which should be 0. + + .. code-block:: bash + + perf stat -a -e imx8_ddr0/axid-read,axi_mask=0xMMMM,axi_id=0xDDDD,axi_channel=0xH/ cmd + perf stat -a -e imx8_ddr0/axid-write,axi_mask=0xMMMM,axi_id=0xDDDD,axi_channel=0xH/ cmd + + .. note:: + + axi_channel is inverted in userspace, and it will be reverted in driver + automatically. So that users do not need specify axi_channel if want to + monitor data channel from DDR transactions, since data channel is more + meaningful. From 2fe44e7dcb86601f0e12735621ed1113536eb392 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 20 Nov 2023 17:33:15 +0800 Subject: [PATCH 136/310] dt-bindings: perf: fsl-imx-ddr: Add i.MX8DXL compatible Add a compatible for i.MX8DXL which is compatile with "fsl,imx8-ddr-pmu". Acked-by: Krzysztof Kozlowski Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20231120093317.2652866-3-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index e9fad4b3de68..6c96a4204e5d 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -27,6 +27,9 @@ properties: - fsl,imx8mq-ddr-pmu - fsl,imx8mp-ddr-pmu - const: fsl,imx8m-ddr-pmu + - items: + - const: fsl,imx8dxl-ddr-pmu + - const: fsl,imx8-ddr-pmu reg: maxItems: 1 From 46fe448ec3b7a10253fcca7fe7e0d8f01a2b38e4 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 20 Nov 2023 17:33:16 +0800 Subject: [PATCH 137/310] perf: fsl_imx8_ddr: Add driver support for i.MX8DXL DDR Perf Add driver support for i.MX8DXL DDR Perf, which supports AXI ID PORT CHANNEL filter. Signed-off-by: Xu Yang Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20231120093317.2652866-4-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx8_ddr_perf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index d0eae2d7e64b..7dbfaee372c7 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -92,6 +92,11 @@ static const struct fsl_ddr_devtype_data imx8mp_devtype_data = { .identifier = "i.MX8MP", }; +static const struct fsl_ddr_devtype_data imx8dxl_devtype_data = { + .quirks = DDR_CAP_AXI_ID_PORT_CHANNEL_FILTER, + .identifier = "i.MX8DXL", +}; + static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx8-ddr-pmu", .data = &imx8_devtype_data}, { .compatible = "fsl,imx8m-ddr-pmu", .data = &imx8m_devtype_data}, @@ -99,6 +104,7 @@ static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx8mm-ddr-pmu", .data = &imx8mm_devtype_data}, { .compatible = "fsl,imx8mn-ddr-pmu", .data = &imx8mn_devtype_data}, { .compatible = "fsl,imx8mp-ddr-pmu", .data = &imx8mp_devtype_data}, + { .compatible = "fsl,imx8dxl-ddr-pmu", .data = &imx8dxl_devtype_data}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, imx_ddr_pmu_dt_ids); From 8fd7588fd4eefe2e474b433f930eba99415ece9e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 27 Nov 2023 00:10:45 +0900 Subject: [PATCH 138/310] arm64: replace with Commit ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") deprecated , which is now a wrapper of . Replace #include with #include . Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20231126151045.1556686-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 376a980f2bad..7b1975bf4b90 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -12,7 +12,7 @@ #ifndef __ASM_ASSEMBLER_H #define __ASM_ASSEMBLER_H -#include +#include #include #include From 75b5e0bf90bffaca4b1f19114065dc59f5cc161f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Fri, 24 Nov 2023 11:15:13 +0800 Subject: [PATCH 139/310] arm64: irq: set the correct node for VMAP stack In current code, init_irq_stacks() will call cpu_to_node(). The cpu_to_node() depends on percpu "numa_node" which is initialized in: arch_call_rest_init() --> rest_init() -- kernel_init() --> kernel_init_freeable() --> smp_prepare_cpus() But init_irq_stacks() is called in init_IRQ() which is before arch_call_rest_init(). So in init_irq_stacks(), the cpu_to_node() does not work, it always return 0. In NUMA, it makes the node 1 cpu accesses the IRQ stack which is in the node 0. This patch fixes it by: 1.) export the early_cpu_to_node(), and use it in the init_irq_stacks(). 2.) change init_irq_stacks() to __init function. Reviewed-by: Catalin Marinas Signed-off-by: Huang Shijie Link: https://lore.kernel.org/r/20231124031513.81548-1-shijie@os.amperecomputing.com Signed-off-by: Will Deacon --- arch/arm64/kernel/irq.c | 5 +++-- drivers/base/arch_numa.c | 2 +- include/asm-generic/numa.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 6ad5c6ef5329..9f253d8efe90 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -51,13 +52,13 @@ static void init_irq_scs(void) } #ifdef CONFIG_VMAP_STACK -static void init_irq_stacks(void) +static void __init init_irq_stacks(void) { int cpu; unsigned long *p; for_each_possible_cpu(cpu) { - p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, early_cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index eaa31e567d1e..5b59d133b6af 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -144,7 +144,7 @@ void __init early_map_cpu_to_node(unsigned int cpu, int nid) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static int __init early_cpu_to_node(int cpu) +int __init early_cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } diff --git a/include/asm-generic/numa.h b/include/asm-generic/numa.h index 1a3ad6d29833..c32e0cf23c90 100644 --- a/include/asm-generic/numa.h +++ b/include/asm-generic/numa.h @@ -35,6 +35,7 @@ int __init numa_add_memblk(int nodeid, u64 start, u64 end); void __init numa_set_distance(int from, int to, int distance); void __init numa_free_distance(void); void __init early_map_cpu_to_node(unsigned int cpu, int nid); +int __init early_cpu_to_node(int cpu); void numa_store_cpu_info(unsigned int cpu); void numa_add_cpu(unsigned int cpu); void numa_remove_cpu(unsigned int cpu); @@ -46,6 +47,7 @@ static inline void numa_add_cpu(unsigned int cpu) { } static inline void numa_remove_cpu(unsigned int cpu) { } static inline void arch_numa_init(void) { } static inline void early_map_cpu_to_node(unsigned int cpu, int nid) { } +static inline int early_cpu_to_node(int cpu) { return 0; } #endif /* CONFIG_NUMA */ From 365b1900c93a6a4860fce8c429e2d8b6e154a107 Mon Sep 17 00:00:00 2001 From: Tsung-Han Lin Date: Sun, 3 Dec 2023 09:18:04 +0800 Subject: [PATCH 140/310] Documentation/arch/arm64: Fix typo Should be 'if' here. Signed-off-by: Tsung-Han Lin Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20231203011804.27694-1-tsunghan.tw@gmail.com Signed-off-by: Will Deacon --- Documentation/arch/arm64/arm-acpi.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/arm-acpi.rst b/Documentation/arch/arm64/arm-acpi.rst index a46c34fa9604..e59e4505d0d9 100644 --- a/Documentation/arch/arm64/arm-acpi.rst +++ b/Documentation/arch/arm64/arm-acpi.rst @@ -130,7 +130,7 @@ When an Arm system boots, it can either have DT information, ACPI tables, or in some very unusual cases, both. If no command line parameters are used, the kernel will try to use DT for device enumeration; if there is no DT present, the kernel will try to use ACPI tables, but only if they are present. -In neither is available, the kernel will not boot. If acpi=force is used +If neither is available, the kernel will not boot. If acpi=force is used on the command line, the kernel will attempt to use ACPI tables first, but fall back to DT if there are no ACPI tables present. The basic idea is that the kernel will not fail to boot unless it absolutely has no other choice. From a264f715ecb3e6dac7c4d7135db74eb2379cb086 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 13 Nov 2023 16:53:14 +0800 Subject: [PATCH 141/310] EDAC/igen6: Make get_mchbar() helper function Make get_mchbar() helper function to retrieve the BAR address of the memory controller. No function changes. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/igen6_edac.c | 46 ++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 1a18693294db..eadc3e559311 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -222,6 +222,36 @@ static struct work_struct ecclog_work; #define DID_ADL_SKU3 0x4621 #define DID_ADL_SKU4 0x4641 +static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) +{ + union { + u64 v; + struct { + u32 v_lo; + u32 v_hi; + }; + } u; + + if (pci_read_config_dword(pdev, MCHBAR_OFFSET, &u.v_lo)) { + igen6_printk(KERN_ERR, "Failed to read lower MCHBAR\n"); + return -ENODEV; + } + + if (pci_read_config_dword(pdev, MCHBAR_OFFSET + 4, &u.v_hi)) { + igen6_printk(KERN_ERR, "Failed to read upper MCHBAR\n"); + return -ENODEV; + } + + if (!(u.v & MCHBAR_EN)) { + igen6_printk(KERN_ERR, "MCHBAR is disabled\n"); + return -ENODEV; + } + + *mchbar = MCHBAR_BASE(u.v); + + return 0; +} + static bool ehl_ibecc_available(struct pci_dev *pdev) { u32 v; @@ -969,22 +999,8 @@ static int igen6_pci_setup(struct pci_dev *pdev, u64 *mchbar) igen6_tom = u.v & GENMASK_ULL(38, 20); - if (pci_read_config_dword(pdev, MCHBAR_OFFSET, &u.v_lo)) { - igen6_printk(KERN_ERR, "Failed to read lower MCHBAR\n"); + if (get_mchbar(pdev, mchbar)) goto fail; - } - - if (pci_read_config_dword(pdev, MCHBAR_OFFSET + 4, &u.v_hi)) { - igen6_printk(KERN_ERR, "Failed to read upper MCHBAR\n"); - goto fail; - } - - if (!(u.v & MCHBAR_EN)) { - igen6_printk(KERN_ERR, "MCHBAR is disabled\n"); - goto fail; - } - - *mchbar = MCHBAR_BASE(u.v); #ifdef CONFIG_EDAC_DEBUG if (pci_read_config_dword(pdev, TOUUD_OFFSET, &u.v_lo)) From c4a5398991fd2ad3011c486571300574495bc834 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 13 Nov 2023 16:53:15 +0800 Subject: [PATCH 142/310] EDAC/igen6: Add Intel Alder Lake-N SoCs support Add Intel Alder Lake-N SoC compute die IDs for EDAC support. Alder Lake-N, with one memory controller, is a reduced version of Alder Lake-P, which has two memory controllers. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/igen6_edac.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index eadc3e559311..6e9b0cd076b4 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -222,6 +222,19 @@ static struct work_struct ecclog_work; #define DID_ADL_SKU3 0x4621 #define DID_ADL_SKU4 0x4641 +/* Compute die IDs for Alder Lake-N with IBECC */ +#define DID_ADL_N_SKU1 0x4614 +#define DID_ADL_N_SKU2 0x4617 +#define DID_ADL_N_SKU3 0x461b +#define DID_ADL_N_SKU4 0x461c +#define DID_ADL_N_SKU5 0x4673 +#define DID_ADL_N_SKU6 0x4674 +#define DID_ADL_N_SKU7 0x4675 +#define DID_ADL_N_SKU8 0x4677 +#define DID_ADL_N_SKU9 0x4678 +#define DID_ADL_N_SKU10 0x4679 +#define DID_ADL_N_SKU11 0x467c + static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) { union { @@ -433,6 +446,17 @@ static struct res_config adl_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config adl_n_cfg = { + .machine_check = true, + .num_imc = 1, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x68, + .ibecc_available = tgl_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -454,6 +478,17 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_ADL_SKU2), (kernel_ulong_t)&adl_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_SKU3), (kernel_ulong_t)&adl_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_SKU4), (kernel_ulong_t)&adl_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU1), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU2), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU3), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU4), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU5), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU6), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU7), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU8), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU9), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU10), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_ADL_N_SKU11), (kernel_ulong_t)&adl_n_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); From d23627a7688fabff0096a7beaff1a93de76afaad Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 13 Nov 2023 16:53:16 +0800 Subject: [PATCH 143/310] EDAC/igen6: Add Intel Raptor Lake-P SoCs support Add Intel Raptor Lake-P SoC compute die IDs for EDAC support. These Raptor Lake-P SoCs share similar IBECC registers with Alder Lake-P SoCs but extend the most significant bit of the error address logged in IBECC from bit 38 to bit 45. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/igen6_edac.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 6e9b0cd076b4..f7914ce3d3d7 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -80,6 +80,7 @@ #define ECC_ERROR_LOG_UE BIT_ULL(63) #define ECC_ERROR_LOG_ADDR_SHIFT 5 #define ECC_ERROR_LOG_ADDR(v) GET_BITFIELD(v, 5, 38) +#define ECC_ERROR_LOG_ADDR45(v) GET_BITFIELD(v, 5, 45) #define ECC_ERROR_LOG_SYND(v) GET_BITFIELD(v, 46, 61) /* Host MMIO base address */ @@ -133,6 +134,8 @@ static struct res_config { u32 ibecc_base; u32 ibecc_error_log_offset; bool (*ibecc_available)(struct pci_dev *pdev); + /* Extract error address logged in IBECC */ + u64 (*err_addr)(u64 ecclog); /* Convert error address logged in IBECC to system physical address */ u64 (*err_addr_to_sys_addr)(u64 eaddr, int mc); /* Convert error address logged in IBECC to integrated memory controller address */ @@ -235,6 +238,13 @@ static struct work_struct ecclog_work; #define DID_ADL_N_SKU10 0x4679 #define DID_ADL_N_SKU11 0x467c +/* Compute die IDs for Raptor Lake-P with IBECC */ +#define DID_RPL_P_SKU1 0xa706 +#define DID_RPL_P_SKU2 0xa707 +#define DID_RPL_P_SKU3 0xa708 +#define DID_RPL_P_SKU4 0xa716 +#define DID_RPL_P_SKU5 0xa718 + static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) { union { @@ -401,6 +411,11 @@ static u64 adl_err_addr_to_imc_addr(u64 eaddr, int mc) return imc_addr; } +static u64 rpl_p_err_addr(u64 ecclog) +{ + return ECC_ERROR_LOG_ADDR45(ecclog); +} + static struct res_config ehl_cfg = { .num_imc = 1, .imc_base = 0x5000, @@ -457,6 +472,18 @@ static struct res_config adl_n_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config rpl_p_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x68, + .ibecc_available = tgl_ibecc_available, + .err_addr = rpl_p_err_addr, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -489,6 +516,11 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_ADL_N_SKU9), (kernel_ulong_t)&adl_n_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_N_SKU10), (kernel_ulong_t)&adl_n_cfg }, { PCI_VDEVICE(INTEL, DID_ADL_N_SKU11), (kernel_ulong_t)&adl_n_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU1), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU2), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU3), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU4), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_RPL_P_SKU5), (kernel_ulong_t)&rpl_p_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); @@ -744,8 +776,11 @@ static void ecclog_work_cb(struct work_struct *work) llist_for_each_entry_safe(node, tmp, head, llnode) { memset(&res, 0, sizeof(res)); - eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) << - ECC_ERROR_LOG_ADDR_SHIFT; + if (res_cfg->err_addr) + eaddr = res_cfg->err_addr(node->ecclog); + else + eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) << + ECC_ERROR_LOG_ADDR_SHIFT; res.mc = node->mc; res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr, res.mc); res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr, res.mc); From 3c77090c1247d963f2559e77810a5d48efeeb7d6 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 13 Nov 2023 16:53:17 +0800 Subject: [PATCH 144/310] EDAC/igen6: Add Intel Meteor Lake-PS SoCs support Add Intel Meteor Lake-PS SoC compute die IDs for EDAC support. These SoCs share similar IBECC registers with Alder Lake-P SoCs. The only difference is that IBECC presence is detected through an MMIO-mapped register instead of the capability register in the PCI configuration space. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/igen6_edac.c | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index f7914ce3d3d7..d336ba53e67c 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -245,6 +245,12 @@ static struct work_struct ecclog_work; #define DID_RPL_P_SKU4 0xa716 #define DID_RPL_P_SKU5 0xa718 +/* Compute die IDs for Meteor Lake-PS with IBECC */ +#define DID_MTL_PS_SKU1 0x7d21 +#define DID_MTL_PS_SKU2 0x7d22 +#define DID_MTL_PS_SKU3 0x7d23 +#define DID_MTL_PS_SKU4 0x7d24 + static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) { union { @@ -325,6 +331,29 @@ static bool tgl_ibecc_available(struct pci_dev *pdev) return !(CAPID_E_IBECC & v); } +static bool mtl_ps_ibecc_available(struct pci_dev *pdev) +{ +#define MCHBAR_MEMSS_IBECCDIS 0x13c00 + void __iomem *window; + u64 mchbar; + u32 val; + + if (get_mchbar(pdev, &mchbar)) + return false; + + window = ioremap(mchbar, MCHBAR_SIZE * 2); + if (!window) { + igen6_printk(KERN_ERR, "Failed to ioremap 0x%llx\n", mchbar); + return false; + } + + val = readl(window + MCHBAR_MEMSS_IBECCDIS); + iounmap(window); + + /* Bit6: 1 - IBECC is disabled, 0 - IBECC isn't disabled */ + return !GET_BITFIELD(val, 6, 6); +} + static u64 mem_addr_to_sys_addr(u64 maddr) { if (maddr < igen6_tolud) @@ -484,6 +513,17 @@ static struct res_config rpl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config mtl_ps_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = mtl_ps_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -521,6 +561,10 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_RPL_P_SKU3), (kernel_ulong_t)&rpl_p_cfg }, { PCI_VDEVICE(INTEL, DID_RPL_P_SKU4), (kernel_ulong_t)&rpl_p_cfg }, { PCI_VDEVICE(INTEL, DID_RPL_P_SKU5), (kernel_ulong_t)&rpl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU1), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU2), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU3), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU4), (kernel_ulong_t)&mtl_ps_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); From 6807434ff0449ff9b3fc18fb40d1ab1c5ff3b708 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 13 Nov 2023 16:53:18 +0800 Subject: [PATCH 145/310] EDAC/igen6: Add Intel Meteor Lake-P SoCs support Add Intel Meteor Lake-P SoC compute die IDs for EDAC support. These Meteor Lake-P SoCs share similar IBECC registers with Alder Lake-P SoCs. Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/igen6_edac.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index d336ba53e67c..2b0ecdeba5cd 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -58,6 +58,7 @@ /* Capability register E */ #define CAPID_E_OFFSET 0xf0 #define CAPID_E_IBECC BIT(12) +#define CAPID_E_IBECC_BIT18 BIT(18) /* Error Status */ #define ERRSTS_OFFSET 0xc8 @@ -251,6 +252,11 @@ static struct work_struct ecclog_work; #define DID_MTL_PS_SKU3 0x7d23 #define DID_MTL_PS_SKU4 0x7d24 +/* Compute die IDs for Meteor Lake-P with IBECC */ +#define DID_MTL_P_SKU1 0x7d01 +#define DID_MTL_P_SKU2 0x7d02 +#define DID_MTL_P_SKU3 0x7d14 + static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) { union { @@ -331,6 +337,16 @@ static bool tgl_ibecc_available(struct pci_dev *pdev) return !(CAPID_E_IBECC & v); } +static bool mtl_p_ibecc_available(struct pci_dev *pdev) +{ + u32 v; + + if (pci_read_config_dword(pdev, CAPID_E_OFFSET, &v)) + return false; + + return !(CAPID_E_IBECC_BIT18 & v); +} + static bool mtl_ps_ibecc_available(struct pci_dev *pdev) { #define MCHBAR_MEMSS_IBECCDIS 0x13c00 @@ -524,6 +540,17 @@ static struct res_config mtl_ps_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config mtl_p_cfg = { + .machine_check = true, + .num_imc = 2, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = mtl_p_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -565,6 +592,9 @@ static const struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU2), (kernel_ulong_t)&mtl_ps_cfg }, { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU3), (kernel_ulong_t)&mtl_ps_cfg }, { PCI_VDEVICE(INTEL, DID_MTL_PS_SKU4), (kernel_ulong_t)&mtl_ps_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU1), (kernel_ulong_t)&mtl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU2), (kernel_ulong_t)&mtl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_MTL_P_SKU3), (kernel_ulong_t)&mtl_p_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); From a50cc8de9995640d3c8062c13bffb67c01782674 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 16:10:45 +0200 Subject: [PATCH 146/310] EDAC, pnd2: Replace custom definition by one from sizes.h The sizes.h provides a set of common size definitions, use it. Signed-off-by: Andy Shevchenko Reviewed-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/pnd2_edac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 2b306f2cc605..45e3c2913d51 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -109,7 +110,6 @@ static struct mem_ctl_info *pnd2_mci; #define MOT_CHAN_INTLV_BIT_1SLC_2CH 12 #define MOT_CHAN_INTLV_BIT_2SLC_2CH 13 #define SELECTOR_DISABLED (-1) -#define _4GB (1ul << 32) #define PMI_ADDRESS_WIDTH 31 #define PND_MAX_PHYS_BIT 39 @@ -587,7 +587,7 @@ static int get_registers(void) /* Get a contiguous memory address (remove the MMIO gap) */ static u64 remove_mmio_gap(u64 sys) { - return (sys < _4GB) ? sys : sys - (_4GB - top_lm); + return (sys < SZ_4G) ? sys : sys - (SZ_4G - top_lm); } /* Squeeze out one address bit, shift upper part down to fill gap */ @@ -643,7 +643,7 @@ static int sys2pmi(const u64 addr, u32 *pmiidx, u64 *pmiaddr, char *msg) /* Give up if address is out of range, or in MMIO gap */ if (addr >= (1ul << PND_MAX_PHYS_BIT) || - (addr >= top_lm && addr < _4GB) || addr >= top_hm) { + (addr >= top_lm && addr < SZ_4G) || addr >= top_hm) { snprintf(msg, PND2_MSG_SIZE, "Error address 0x%llx is not DRAM", addr); return -EINVAL; } From 530258f872138a9c5db5ace73313c9c88a31c96e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 16:10:46 +0200 Subject: [PATCH 147/310] EDAC, pnd2: Apply bit macros and helpers where it makes sense Apply bit macros (BIT()/BIT_ULL()/GENMASK()/etc) and helpers (for_each_set_bit()/etc) where it makes sense. Signed-off-by: Andy Shevchenko Reviewed-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/pnd2_edac.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 45e3c2913d51..ee7c9b024354 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -183,7 +183,7 @@ static int _apl_rd_reg(int port, int off, int op, u32 *data) } P2SB_READ(dword, P2SB_DATA_OFF, data); - ret = (status >> 1) & 0x3; + ret = (status >> 1) & GENMASK(1, 0); out: /* Hide the P2SB device, if it was hidden before */ if (hidden) @@ -307,7 +307,7 @@ static bool two_channels; /* Both PMI channels in one slice enabled */ static u8 sym_chan_mask; static u8 asym_chan_mask; -static u8 chan_mask; +static unsigned long chan_mask; static int slice_selector = -1; static int chan_selector = -1; @@ -598,7 +598,7 @@ static void remove_addr_bit(u64 *addr, int bitidx) if (bitidx == -1) return; - mask = (1ull << bitidx) - 1; + mask = BIT_ULL(bitidx) - 1; *addr = ((*addr >> 1) & ~mask) | (*addr & mask); } @@ -642,7 +642,7 @@ static int sys2pmi(const u64 addr, u32 *pmiidx, u64 *pmiaddr, char *msg) int sym_chan_shift = sym_channels >> 1; /* Give up if address is out of range, or in MMIO gap */ - if (addr >= (1ul << PND_MAX_PHYS_BIT) || + if (addr >= BIT(PND_MAX_PHYS_BIT) || (addr >= top_lm && addr < SZ_4G) || addr >= top_hm) { snprintf(msg, PND2_MSG_SIZE, "Error address 0x%llx is not DRAM", addr); return -EINVAL; @@ -727,10 +727,10 @@ static int sys2pmi(const u64 addr, u32 *pmiidx, u64 *pmiaddr, char *msg) } /* Translate PMI address to memory (rank, row, bank, column) */ -#define C(n) (0x10 | (n)) /* column */ -#define B(n) (0x20 | (n)) /* bank */ -#define R(n) (0x40 | (n)) /* row */ -#define RS (0x80) /* rank */ +#define C(n) (BIT(4) | (n)) /* column */ +#define B(n) (BIT(5) | (n)) /* bank */ +#define R(n) (BIT(6) | (n)) /* row */ +#define RS (BIT(7)) /* rank */ /* addrdec values */ #define AMAP_1KB 0 @@ -1064,9 +1064,9 @@ static int apl_check_ecc_active(void) int i, ret = 0; /* Check dramtype and ECC mode for each present DIMM */ - for (i = 0; i < APL_NUM_CHANNELS; i++) - if (chan_mask & BIT(i)) - ret += check_channel(i); + for_each_set_bit(i, &chan_mask, APL_NUM_CHANNELS) + ret += check_channel(i); + return ret ? -EINVAL : 0; } @@ -1205,10 +1205,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci) u64 capacity; int i, g; - for (i = 0; i < APL_NUM_CHANNELS; i++) { - if (!(chan_mask & BIT(i))) - continue; - + for_each_set_bit(i, &chan_mask, APL_NUM_CHANNELS) { dimm = edac_get_dimm(mci, i, 0, 0); if (!dimm) { edac_dbg(0, "No allocated DIMM for channel %d\n", i); @@ -1228,8 +1225,7 @@ static void apl_get_dimm_config(struct mem_ctl_info *mci) } pvt->dimm_geom[i] = g; - capacity = (d->rken0 + d->rken1) * 8 * (1ul << dimms[g].rowbits) * - (1ul << dimms[g].colbits); + capacity = (d->rken0 + d->rken1) * 8 * BIT(dimms[g].rowbits + dimms[g].colbits); edac_dbg(0, "Channel %d: %lld MByte DIMM\n", i, capacity >> (20 - 3)); dimm->nr_pages = MiB_TO_PAGES(capacity >> (20 - 3)); dimm->grain = 32; @@ -1295,7 +1291,7 @@ static void dnv_get_dimm_config(struct mem_ctl_info *mci) continue; } - capacity = ranks_of_dimm[j] * banks * (1ul << rowbits) * (1ul << colbits); + capacity = ranks_of_dimm[j] * banks * BIT(rowbits + colbits); edac_dbg(0, "Channel %d DIMM %d: %lld MByte DIMM\n", i, j, capacity >> (20 - 3)); dimm->nr_pages = MiB_TO_PAGES(capacity >> (20 - 3)); dimm->grain = 32; From f1b0b1167f8bf139afa76e795d06fb814067d53e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 16:10:47 +0200 Subject: [PATCH 148/310] EDAC, pnd2: Correct misleading error message in mk_region_mask() The mask parameter is expected to be of a sequence of the set bits. It does not mean it must be power of two (only single bit set). Correct misleading error message. Suggested-by: Qiuxu Zhuo Signed-off-by: Andy Shevchenko Signed-off-by: Tony Luck --- drivers/edac/pnd2_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index ee7c9b024354..969fb2465edb 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -329,7 +329,7 @@ static void mk_region_mask(char *name, struct region *rp, u64 base, u64 mask) return; } if (mask != GENMASK_ULL(PND_MAX_PHYS_BIT, __ffs(mask))) { - pr_info(FW_BUG "MOT mask not power of two\n"); + pr_info(FW_BUG "MOT mask is invalid\n"); return; } if (base & ~mask) { From a69badad736c0f460401080a67e6208e25ab25e2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 16:10:48 +0200 Subject: [PATCH 149/310] EDAC, pnd2: Sort headers alphabetically Sort the headers in alphabetic order in order to ease the maintenance for this part. Signed-off-by: Andy Shevchenko Reviewed-by: Qiuxu Zhuo Signed-off-by: Tony Luck --- drivers/edac/pnd2_edac.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 969fb2465edb..2afcd148fcf8 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -16,19 +16,20 @@ * rank, bank, row and column using the appropriate "dunit_ops" functions/parameters. */ -#include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include + #include #include From cf0573939d3f4ce822ceb742a8179f38697b1953 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 28 Nov 2023 15:20:49 +0100 Subject: [PATCH 150/310] Documentation: Begin a RAS section Add some initial RAS documentation. The expectation is for this to collect, among others, all the user-visible features for interaction with the RAS features of the kernel. Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231128142049.GTZWX3QQTSaQk/+u53@fat_crate.local --- Documentation/RAS/ras.rst | 26 ++++++++++++++++++++++++++ Documentation/index.rst | 1 + 2 files changed, 27 insertions(+) create mode 100644 Documentation/RAS/ras.rst diff --git a/Documentation/RAS/ras.rst b/Documentation/RAS/ras.rst new file mode 100644 index 000000000000..2556b397cd27 --- /dev/null +++ b/Documentation/RAS/ras.rst @@ -0,0 +1,26 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Reliability, Availability and Serviceability features +===================================================== + +This documents different aspects of the RAS functionality present in the +kernel. + +Error decoding +--------------- + +* x86 + +Error decoding on AMD systems should be done using the rasdaemon tool: +https://github.com/mchehab/rasdaemon/ + +While the daemon is running, it would automatically log and decode +errors. If not, one can still decode such errors by supplying the +hardware information from the error:: + + $ rasdaemon -p --status --ipid --smca + +Also, the user can pass particular family and model to decode the error +string:: + + $ rasdaemon -p --status --ipid --smca --family --model --bank diff --git a/Documentation/index.rst b/Documentation/index.rst index 9dfdc826618c..36e61783437c 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -113,6 +113,7 @@ to ReStructured Text format, or are simply too old. :maxdepth: 1 staging/index + RAS/ras Translations From 7d370e1812b9a5f5cc68aaa5991bf7d31d8ff52c Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:06 +0530 Subject: [PATCH 151/310] KVM: PPC: Book3S HV nestedv2: Invalidate RPT before deleting a guest An L0 must invalidate the L2's RPT during H_GUEST_DELETE if this has not already been done. This is a slow operation that means H_GUEST_DELETE must return H_BUSY multiple times before completing. Invalidating the tables before deleting the guest so there is less work for the L0 to do. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-2-vaibhav@linux.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/kvm/book3s_hv.c | 6 ++++-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 4f527d09c92b..a37736ed3728 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -302,6 +302,7 @@ void kvmhv_nested_exit(void); void kvmhv_vm_nested_init(struct kvm *kvm); long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu); +void kvmhv_flush_lpid(u64 lpid); void kvmhv_set_ptbl_entry(u64 lpid, u64 dw0, u64 dw1); void kvmhv_release_all_nested(struct kvm *kvm); long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1ed6ec140701..5543e8490cd9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5691,10 +5691,12 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0); } - if (kvmhv_is_nestedv2()) + if (kvmhv_is_nestedv2()) { + kvmhv_flush_lpid(kvm->arch.lpid); plpar_guest_delete(0, kvm->arch.lpid); - else + } else { kvmppc_free_lpid(kvm->arch.lpid); + } kvmppc_free_pimap(kvm); } diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 3b658b8696bc..5c375ec1a3c6 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -503,7 +503,7 @@ void kvmhv_nested_exit(void) } } -static void kvmhv_flush_lpid(u64 lpid) +void kvmhv_flush_lpid(u64 lpid) { long rc; From e0d4acbcba3f2d63dc15bc5432c8e26fc9e19675 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:07 +0530 Subject: [PATCH 152/310] KVM: PPC: Book3S HV nestedv2: Avoid reloading the tb offset The kvmppc_get_tb_offset() getter reloads KVMPPC_GSID_TB_OFFSET from the L0 for nestedv2 host. This is unnecessary as the value does not change. KVMPPC_GSID_TB_OFFSET also need not be reloaded in kvmppc_{s,g}et_dec_expires(). Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-3-vaibhav@linux.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a37736ed3728..3e1e2a698c9e 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -594,13 +594,17 @@ static inline u##size kvmppc_get_##reg(struct kvm_vcpu *vcpu) \ KVMPPC_BOOK3S_VCORE_ACCESSOR(vtb, 64, KVMPPC_GSID_VTB) -KVMPPC_BOOK3S_VCORE_ACCESSOR(tb_offset, 64, KVMPPC_GSID_TB_OFFSET) KVMPPC_BOOK3S_VCORE_ACCESSOR_GET(arch_compat, 32, KVMPPC_GSID_LOGICAL_PVR) KVMPPC_BOOK3S_VCORE_ACCESSOR_GET(lpcr, 64, KVMPPC_GSID_LPCR) +KVMPPC_BOOK3S_VCORE_ACCESSOR_SET(tb_offset, 64, KVMPPC_GSID_TB_OFFSET) + +static inline u64 kvmppc_get_tb_offset(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.vcore->tb_offset; +} static inline u64 kvmppc_get_dec_expires(struct kvm_vcpu *vcpu) { - WARN_ON(kvmhv_nestedv2_cached_reload(vcpu, KVMPPC_GSID_TB_OFFSET) < 0); WARN_ON(kvmhv_nestedv2_cached_reload(vcpu, KVMPPC_GSID_DEC_EXPIRY_TB) < 0); return vcpu->arch.dec_expires; } @@ -608,7 +612,6 @@ static inline u64 kvmppc_get_dec_expires(struct kvm_vcpu *vcpu) static inline void kvmppc_set_dec_expires(struct kvm_vcpu *vcpu, u64 val) { vcpu->arch.dec_expires = val; - WARN_ON(kvmhv_nestedv2_cached_reload(vcpu, KVMPPC_GSID_TB_OFFSET) < 0); kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_DEC_EXPIRY_TB); } From 63ccae78cd88b52fb1d598ae33fa8408ce067b30 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:08 +0530 Subject: [PATCH 153/310] KVM: PPC: Book3S HV nestedv2: Do not check msr on hcalls The check for a hcall coming from userspace is done for KVM-PR. This is not supported for nestedv2 and the L0 will directly inject the necessary exception to the L2 if userspace performs a hcall. Avoid checking the MSR and thus avoid a H_GUEST_GET_STATE hcall in the L1. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-4-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5543e8490cd9..069c336b6f3c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1688,7 +1688,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, { int i; - if (unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { + if (!kvmhv_is_nestedv2() && unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { /* * Guest userspace executed sc 1. This can only be * reached by the P9 path because the old path @@ -4949,7 +4949,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_PAPR_HCALL) { accumulate_time(vcpu, &vcpu->arch.hcall); - if (WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { + if (!kvmhv_is_nestedv2() && WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { /* * These should have been caught reflected * into the guest by now. Final sanity check: From e678748a8dca5b57041a84a66577f6168587b3f7 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:09 +0530 Subject: [PATCH 154/310] KVM: PPC: Book3S HV nestedv2: Get the PID only if needed to copy tofrom a guest kvmhv_copy_tofrom_guest_radix() gets the PID at the start of the function. If pid is not used, then this is a wasteful H_GUEST_GET_STATE hcall for nestedv2 hosts. Move the assignment to where pid will be used. Suggested-by: Nicholas Piggin Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-5-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 175a8eb2681f..916af6c153a5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -97,7 +97,7 @@ static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, void *from, unsigned long n) { int lpid = vcpu->kvm->arch.lpid; - int pid = kvmppc_get_pid(vcpu); + int pid; /* This would cause a data segment intr so don't allow the access */ if (eaddr & (0x3FFUL << 52)) @@ -110,6 +110,8 @@ static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, /* If accessing quadrant 3 then pid is expected to be 0 */ if (((eaddr >> 62) & 0x3) == 0x3) pid = 0; + else + pid = kvmppc_get_pid(vcpu); eaddr &= ~(0xFFFUL << 52); From ec0f6639fa8853cf6bfdfc3588aada7eeb7e5e37 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:10 +0530 Subject: [PATCH 155/310] KVM: PPC: Book3S HV nestedv2: Ensure LPCR_MER bit is passed to the L0 LPCR_MER is conditionally set during entry to a guest if there is a pending external interrupt. In the nestedv2 case, this change is not being communicated to the L0, which means it is not being set in the L2. Ensure the updated LPCR value is passed to the L0. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-6-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 069c336b6f3c..6d1f0bca27aa 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4084,6 +4084,8 @@ static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit, if (rc < 0) return -EINVAL; + kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr); + accumulate_time(vcpu, &vcpu->arch.in_guest); rc = plpar_guest_run_vcpu(0, vcpu->kvm->arch.lpid, vcpu->vcpu_id, &trap, &i); From ecd10702baae5c16a91d139bde7eff84ce55daee Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 1 Dec 2023 18:56:11 +0530 Subject: [PATCH 156/310] KVM: PPC: Book3S HV: Handle pending exceptions on guest entry with MSR_EE Commit 026728dc5d41 ("KVM: PPC: Book3S HV P9: Inject pending xive interrupts at guest entry") changed guest entry so that if external interrupts are enabled, BOOK3S_IRQPRIO_EXTERNAL is not tested for. Test for this regardless of MSR_EE. For an L1 host, do not inject an interrupt, but always use LPCR_MER. If the L0 desires it can inject an interrupt. Fixes: 026728dc5d41 ("KVM: PPC: Book3S HV P9: Inject pending xive interrupts at guest entry") Signed-off-by: Nicholas Piggin [jpn: use kvmpcc_get_msr(), write commit message] Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-7-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6d1f0bca27aa..4dc6a928073f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4738,13 +4738,19 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (!nested) { kvmppc_core_prepare_to_enter(vcpu); - if (__kvmppc_get_msr_hv(vcpu) & MSR_EE) { - if (xive_interrupt_pending(vcpu)) + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions) || + xive_interrupt_pending(vcpu)) { + /* + * For nested HV, don't synthesize but always pass MER, + * the L0 will be able to optimise that more + * effectively than manipulating registers directly. + */ + if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE)) kvmppc_inject_interrupt_hv(vcpu, - BOOK3S_INTERRUPT_EXTERNAL, 0); - } else if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, - &vcpu->arch.pending_exceptions)) { - lpcr |= LPCR_MER; + BOOK3S_INTERRUPT_EXTERNAL, 0); + else + lpcr |= LPCR_MER; } } else if (vcpu->arch.pending_exceptions || vcpu->arch.doorbell_request || From df938a5576f3f3b08e1f217c660385c0d58a0b91 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:12 +0530 Subject: [PATCH 157/310] KVM: PPC: Book3S HV nestedv2: Do not inject certain interrupts There is no need to inject an external interrupt in kvmppc_book3s_irqprio_deliver() as the test for BOOK3S_IRQPRIO_EXTERNAL in kvmhv_run_single_vcpu() before guest entry will raise LPCR_MER if needed. There is also no need to inject the decrementer interrupt as this will be raised within the L2 if needed. Avoiding these injections reduces H_GUEST_GET_STATE hcalls by the L1. Suggested-by: Nicholas Piggin Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-8-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6cd20ab9e94e..8acec144120e 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -302,11 +302,11 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, switch (priority) { case BOOK3S_IRQPRIO_DECREMENTER: - deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; + deliver = !kvmhv_is_nestedv2() && (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: - deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; + deliver = !kvmhv_is_nestedv2() && (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; case BOOK3S_IRQPRIO_SYSTEM_RESET: From a9a3de530d7531bf6cd3f6ccda769cd94c1105a0 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:13 +0530 Subject: [PATCH 158/310] KVM: PPC: Book3S HV nestedv2: Avoid msr check in kvmppc_handle_exit_hv() The msr check in kvmppc_handle_exit_hv() is not needed for nestedv2 hosts, skip the check to avoid a H_GUEST_GET_STATE hcall. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-9-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 4dc6a928073f..47fe470375df 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1597,7 +1597,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, * That can happen due to a bug, or due to a machine check * occurring at just the wrong time. */ - if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) { + if (!kvmhv_is_nestedv2() && (__kvmppc_get_msr_hv(vcpu) & MSR_HV)) { printk(KERN_EMERG "KVM trap in HV mode!\n"); printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", vcpu->arch.trap, kvmppc_get_pc(vcpu), From 4bc8ff6f170c78f64446c5d5f9ef6771eefd3416 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:14 +0530 Subject: [PATCH 159/310] KVM: PPC: Book3S HV nestedv2: Do not call H_COPY_TOFROM_GUEST H_COPY_TOFROM_GUEST is part of the nestedv1 API and so should not be called by a nestedv2 host. Do not attempt to call it. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-10-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 916af6c153a5..4a1abb9f7c05 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -40,6 +40,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, unsigned long quadrant, ret = n; bool is_load = !!to; + if (kvmhv_is_nestedv2()) + return H_UNSUPPORTED; + /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */ if (kvmhv_on_pseries()) return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr, From db1dcfae1dae3c042f348175ac0394e2fc14b1b3 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:15 +0530 Subject: [PATCH 160/310] KVM: PPC: Book3S HV nestedv2: Register the VPA with the L0 In the nestedv2 case, the L1 may register the L2's VPA with the L0. This allows the L0 to manage the L2's dispatch count, as well as enable possible performance optimisations by seeing if certain resources are not being used by the L2 (such as the PMCs). Use the H_GUEST_SET_STATE call to inform the L0 of the L2's VPA address. This can not be done in the H_GUEST_VCPU_RUN input buffer. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-11-vaibhav@linux.ibm.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 + arch/powerpc/kvm/book3s_hv.c | 38 ++++++++++++++++++------ arch/powerpc/kvm/book3s_hv_nestedv2.c | 29 ++++++++++++++++++ 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2477021bff54..d8729ec81ca0 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -682,6 +682,7 @@ void kvmhv_nestedv2_vcpu_free(struct kvm_vcpu *vcpu, struct kvmhv_nestedv2_io *i int kvmhv_nestedv2_flush_vcpu(struct kvm_vcpu *vcpu, u64 time_limit); int kvmhv_nestedv2_set_ptbl_entry(unsigned long lpid, u64 dw0, u64 dw1); int kvmhv_nestedv2_parse_output(struct kvm_vcpu *vcpu); +int kvmhv_nestedv2_set_vpa(struct kvm_vcpu *vcpu, unsigned long vpa); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 47fe470375df..2ee3f2478570 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -650,7 +650,8 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, return err; } -static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) +static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap, + struct kvmppc_vpa *old_vpap) { struct kvm *kvm = vcpu->kvm; void *va; @@ -690,9 +691,8 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) kvmppc_unpin_guest_page(kvm, va, gpa, false); va = NULL; } - if (vpap->pinned_addr) - kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, - vpap->dirty); + *old_vpap = *vpap; + vpap->gpa = gpa; vpap->pinned_addr = va; vpap->dirty = false; @@ -702,6 +702,9 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvmppc_vpa old_vpa = { 0 }; + if (!(vcpu->arch.vpa.update_pending || vcpu->arch.slb_shadow.update_pending || vcpu->arch.dtl.update_pending)) @@ -709,17 +712,34 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) { - kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); - if (vcpu->arch.vpa.pinned_addr) + kvmppc_update_vpa(vcpu, &vcpu->arch.vpa, &old_vpa); + if (old_vpa.pinned_addr) { + if (kvmhv_is_nestedv2()) + kvmhv_nestedv2_set_vpa(vcpu, ~0ull); + kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa, + old_vpa.dirty); + } + if (vcpu->arch.vpa.pinned_addr) { init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + if (kvmhv_is_nestedv2()) + kvmhv_nestedv2_set_vpa(vcpu, __pa(vcpu->arch.vpa.pinned_addr)); + } } if (vcpu->arch.dtl.update_pending) { - kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); + kvmppc_update_vpa(vcpu, &vcpu->arch.dtl, &old_vpa); + if (old_vpa.pinned_addr) + kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa, + old_vpa.dirty); vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; vcpu->arch.dtl_index = 0; } - if (vcpu->arch.slb_shadow.update_pending) - kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); + if (vcpu->arch.slb_shadow.update_pending) { + kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow, &old_vpa); + if (old_vpa.pinned_addr) + kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa, + old_vpa.dirty); + } + spin_unlock(&vcpu->arch.vpa_update_lock); } diff --git a/arch/powerpc/kvm/book3s_hv_nestedv2.c b/arch/powerpc/kvm/book3s_hv_nestedv2.c index fd3c4f2d9480..5378eb40b162 100644 --- a/arch/powerpc/kvm/book3s_hv_nestedv2.c +++ b/arch/powerpc/kvm/book3s_hv_nestedv2.c @@ -855,6 +855,35 @@ free_gsb: } EXPORT_SYMBOL_GPL(kvmhv_nestedv2_set_ptbl_entry); +/** + * kvmhv_nestedv2_set_vpa() - register L2 VPA with L0 + * @vcpu: vcpu + * @vpa: L1 logical real address + */ +int kvmhv_nestedv2_set_vpa(struct kvm_vcpu *vcpu, unsigned long vpa) +{ + struct kvmhv_nestedv2_io *io; + struct kvmppc_gs_buff *gsb; + int rc = 0; + + io = &vcpu->arch.nestedv2_io; + gsb = io->vcpu_run_input; + + kvmppc_gsb_reset(gsb); + rc = kvmppc_gse_put_u64(gsb, KVMPPC_GSID_VPA, vpa); + if (rc < 0) + goto out; + + rc = kvmppc_gsb_send(gsb, 0); + if (rc < 0) + pr_err("KVM-NESTEDv2: couldn't register the L2 VPA (rc=%d)\n", rc); + +out: + kvmppc_gsb_reset(gsb); + return rc; +} +EXPORT_SYMBOL_GPL(kvmhv_nestedv2_set_vpa); + /** * kvmhv_nestedv2_parse_output() - receive values from H_GUEST_RUN_VCPU output * @vcpu: vcpu From 797a5af8fc7297b19e5c6b1713956ebf1e6c1cde Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:16 +0530 Subject: [PATCH 161/310] KVM: PPC: Reduce reliance on analyse_instr() in mmio emulation Commit 709236039964 ("KVM: PPC: Reimplement non-SIMD LOAD/STORE instruction mmio emulation with analyse_instr() input") and commit 2b33cb585f94 ("KVM: PPC: Reimplement LOAD_FP/STORE_FP instruction mmio emulation with analyse_instr() input") made kvmppc_emulate_loadstore() use the results from analyse_instr() for instruction emulation. In particular the effective address from analyse_instr() is used for UPDATE type instructions and fact that op.val is all ready endian corrected is used in the STORE case. However, these changes now have some negative implications for the nestedv2 case. For analyse_instr() to determine the correct effective address, the GPRs must be loaded from the L0. This is not needed as vcpu->arch.vaddr_accessed is already set. Change back to using vcpu->arch.vaddr_accessed. In the STORE case, use kvmppc_get_gpr() value instead of the op.val. kvmppc_get_gpr() will reload from the L0 if needed in the nestedv2 case. This means if a byte reversal is needed must now be passed to kvmppc_handle_store() like in the kvmppc_handle_load() case. This means the call to kvmhv_nestedv2_reload_ptregs() can be avoided as there is no concern about op.val being stale. Drop the call to kvmhv_nestedv2_mark_dirty_ptregs() as without the call to kvmhv_nestedv2_reload_ptregs(), stale state could be marked as valid. This is fine as the required marking things dirty is already handled for the UPDATE case by the call to kvmppc_set_gpr(). For LOADs, it is handled in kvmppc_complete_mmio_load(). This is called either directly in __kvmppc_handle_load() if the load can be handled in KVM, or on the next kvm_arch_vcpu_ioctl_run() if an exit was required. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-12-vaibhav@linux.ibm.com --- arch/powerpc/kvm/emulate_loadstore.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 077fd88a0b68..ec60c7979718 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -93,7 +93,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = EMULATE_FAIL; vcpu->arch.regs.msr = kvmppc_get_msr(vcpu); - kvmhv_nestedv2_reload_ptregs(vcpu, &vcpu->arch.regs); if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { int type = op.type & INSTR_TYPE_MASK; int size = GETSIZE(op.type); @@ -112,7 +111,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) op.reg, size, !instr_byte_swap); if ((op.type & UPDATE) && (emulated != EMULATE_FAIL)) - kvmppc_set_gpr(vcpu, op.update_reg, op.ea); + kvmppc_set_gpr(vcpu, op.update_reg, vcpu->arch.vaddr_accessed); break; } @@ -132,7 +131,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) KVM_MMIO_REG_FPR|op.reg, size, 1); if ((op.type & UPDATE) && (emulated != EMULATE_FAIL)) - kvmppc_set_gpr(vcpu, op.update_reg, op.ea); + kvmppc_set_gpr(vcpu, op.update_reg, vcpu->arch.vaddr_accessed); break; #endif @@ -224,16 +223,17 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) break; } #endif - case STORE: - /* if need byte reverse, op.val has been reversed by - * analyse_instr(). - */ - emulated = kvmppc_handle_store(vcpu, op.val, size, 1); + case STORE: { + int instr_byte_swap = op.type & BYTEREV; + + emulated = kvmppc_handle_store(vcpu, kvmppc_get_gpr(vcpu, op.reg), + size, !instr_byte_swap); if ((op.type & UPDATE) && (emulated != EMULATE_FAIL)) - kvmppc_set_gpr(vcpu, op.update_reg, op.ea); + kvmppc_set_gpr(vcpu, op.update_reg, vcpu->arch.vaddr_accessed); break; + } #ifdef CONFIG_PPC_FPU case STORE_FP: if (kvmppc_check_fp_disabled(vcpu)) @@ -254,7 +254,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_get_fpr(vcpu, op.reg), size, 1); if ((op.type & UPDATE) && (emulated != EMULATE_FAIL)) - kvmppc_set_gpr(vcpu, op.update_reg, op.ea); + kvmppc_set_gpr(vcpu, op.update_reg, vcpu->arch.vaddr_accessed); break; #endif @@ -358,7 +358,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) } trace_kvm_ppc_instr(ppc_inst_val(inst), kvmppc_get_pc(vcpu), emulated); - kvmhv_nestedv2_mark_dirty_ptregs(vcpu, &vcpu->arch.regs); /* Advance past emulated instruction. */ if (emulated != EMULATE_FAIL) From 180c6b072bf360b686e53d893d8dcf7dbbaec6bb Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 1 Dec 2023 18:56:17 +0530 Subject: [PATCH 162/310] KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception In the nestedv2 case, if there is a pending decrementer exception, the L1 must get the L2's timebase from the L0 to see if the exception should be cancelled. This adds the overhead of a H_GUEST_GET_STATE call to the likely case in which the decrementer should not be cancelled. Avoid this logic for the nestedv2 case. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://msgid.link/20231201132618.555031-13-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2ee3f2478570..e48126a59ba7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4834,7 +4834,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * entering a nested guest in which case the decrementer is now owned * by L2 and the L1 decrementer is provided in hdec_expires */ - if (kvmppc_core_pending_dec(vcpu) && + if (!kvmhv_is_nestedv2() && kvmppc_core_pending_dec(vcpu) && ((tb < kvmppc_dec_expires_host_tb(vcpu)) || (trap == BOOK3S_INTERRUPT_SYSCALL && kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED))) From 773b93f1d1c38c5c0d5308b8c9229c7a6ec5b2a0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (IBM)" Date: Mon, 4 Dec 2023 15:06:37 +0530 Subject: [PATCH 163/310] powerpc/book3s/hash: Drop _PAGE_PRIVILEGED from PAGE_NONE There used to be a dependency on _PAGE_PRIVILEGED with pte_savedwrite. But that got dropped by commit 6a56ccbcf6c6 ("mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite") With the change in this patch numa fault pte (pte_protnone()) gets mapped as regular user pte with RWX cleared (no-access) whereas earlier it used to be mapped _PAGE_PRIVILEGED. Hash fault handling code gets some WARN_ON added in this patch because those functions are not expected to get called with _PAGE_READ cleared. commit 18061c17c8ec ("powerpc/mm: Update PROTFAULT handling in the page fault path") explains the details. Signed-off-by: "Aneesh Kumar K.V (IBM)" Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20231204093638.71503-1-aneesh.kumar@kernel.org --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 ++-------- arch/powerpc/mm/book3s64/hash_utils.c | 7 +++++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb77eddca54b..927d585652bc 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -17,12 +17,6 @@ #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ -#define _PAGE_NA _PAGE_PRIVILEGED -#define _PAGE_NAX _PAGE_EXEC -#define _PAGE_RO _PAGE_READ -#define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC) -#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) -#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) #define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ #define _PAGE_SAO 0x00010 /* Strong access order */ #define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ @@ -532,8 +526,8 @@ static inline bool pte_user(pte_t pte) static inline bool pte_access_permitted(pte_t pte, bool write) { /* - * _PAGE_READ is needed for any access and will be - * cleared for PROT_NONE + * _PAGE_READ is needed for any access and will be cleared for + * PROT_NONE. Execute-only mapping via PROT_EXEC also returns false. */ if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) return false; diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ad2afa08e62e..0626a25b0d72 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -310,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags else rflags |= 0x3; } + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); } else { if (pteflags & _PAGE_RWX) rflags |= 0x2; + /* + * We should never hit this in normal fault handling because + * a permission check (check_pte_access()) will bubble this + * to higher level linux handler even for PAGE_NONE. + */ + VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } From a59c14f6b4caad7671dfb81737beba0b313897e4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (IBM)" Date: Mon, 4 Dec 2023 15:06:38 +0530 Subject: [PATCH 164/310] powerpc/book3s64: Avoid __pte_protnone() check in __pte_flags_need_flush() This reverts commit 1abce0580b89 ("powerpc/64s: Fix __pte_needs_flush() false positive warning") The previous patch dropped the usage of _PAGE_PRIVILEGED with PAGE_NONE. Hence this check can be dropped. Signed-off-by: "Aneesh Kumar K.V (IBM)" Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20231204093638.71503-2-aneesh.kumar@kernel.org --- arch/powerpc/include/asm/book3s/64/tlbflush.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 1950c1b825b4..fd642b729775 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -158,11 +158,6 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma, */ } -static inline bool __pte_protnone(unsigned long pte) -{ - return (pte & (pgprot_val(PAGE_NONE) | _PAGE_RWX)) == pgprot_val(PAGE_NONE); -} - static inline bool __pte_flags_need_flush(unsigned long oldval, unsigned long newval) { @@ -179,8 +174,8 @@ static inline bool __pte_flags_need_flush(unsigned long oldval, /* * We do not expect kernel mappings or non-PTEs or not-present PTEs. */ - VM_WARN_ON_ONCE(!__pte_protnone(oldval) && oldval & _PAGE_PRIVILEGED); - VM_WARN_ON_ONCE(!__pte_protnone(newval) && newval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED); + VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED); VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(newval & _PAGE_PTE)); VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT)); From 4cb3e3ec23fa33a0f58ff133d5aedd26a50356ef Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 5 Dec 2023 16:12:39 +1100 Subject: [PATCH 165/310] MAINTAINERS: powerpc: Transfer PPC83XX to Christophe Christophe volunteered[1] to maintain PPC83XX. 1: https://lore.kernel.org/all/7b1bf4dc-d09d-35b8-f4df-16bf00429b6d@csgroup.eu/ Acked-by: Christophe Leroy Acked-by: Crystal Wood Signed-off-by: Michael Ellerman Link: https://msgid.link/20231205051239.737384-1-mpe@ellerman.id.au --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea790149af79..b876fc9d4f3a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12285,21 +12285,21 @@ S: Orphan F: arch/powerpc/platforms/40x/ F: arch/powerpc/platforms/44x/ -LINUX FOR POWERPC EMBEDDED PPC83XX AND PPC85XX +LINUX FOR POWERPC EMBEDDED PPC85XX M: Scott Wood L: linuxppc-dev@lists.ozlabs.org S: Odd fixes T: git git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux.git F: Documentation/devicetree/bindings/cache/freescale-l2cache.txt F: Documentation/devicetree/bindings/powerpc/fsl/ -F: arch/powerpc/platforms/83xx/ F: arch/powerpc/platforms/85xx/ -LINUX FOR POWERPC EMBEDDED PPC8XX +LINUX FOR POWERPC EMBEDDED PPC8XX AND PPC83XX M: Christophe Leroy L: linuxppc-dev@lists.ozlabs.org S: Maintained F: arch/powerpc/platforms/8xx/ +F: arch/powerpc/platforms/83xx/ LINUX KERNEL DUMP TEST MODULE (LKDTM) M: Kees Cook From dc420877b5bd92db5d80df6b117c7a0f987290af Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 6 Dec 2023 22:55:45 +1100 Subject: [PATCH 166/310] powerpc/Makefile: Don't use $(ARCH) unnecessarily There's no need to use $(ARCH) for references to the arch directory in the source tree, it is always arch/powerpc. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231206115548.1466874-1-mpe@ellerman.id.au --- arch/powerpc/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index f19dbaa1d541..b0bc17c35ed7 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -161,7 +161,7 @@ CFLAGS-y += $(CONFIG_TUNE_CPU) asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) -KBUILD_CPPFLAGS += -I $(srctree)/arch/$(ARCH) $(asinstr) +KBUILD_CPPFLAGS += -I $(srctree)/arch/powerpc $(asinstr) KBUILD_AFLAGS += $(AFLAGS-y) KBUILD_CFLAGS += $(call cc-option,-msoft-float) KBUILD_CFLAGS += $(CFLAGS-y) @@ -232,7 +232,7 @@ BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.% uImage.% PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2) -boot := arch/$(ARCH)/boot +boot := arch/powerpc/boot $(BOOT_TARGETS1): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@) @@ -336,7 +336,7 @@ PHONY += $(generated_configs) define archhelp echo '* zImage - Build default images selected by kernel config' - echo ' zImage.* - Compressed kernel image (arch/$(ARCH)/boot/zImage.*)' + echo ' zImage.* - Compressed kernel image (arch/powerpc/boot/zImage.*)' echo ' uImage - U-Boot native image format' echo ' cuImage.
- Backwards compatible U-Boot image for older' echo ' versions which do not support device trees' @@ -347,12 +347,12 @@ define archhelp echo ' (your) ~/bin/$(INSTALLKERNEL) or' echo ' (distribution) /sbin/$(INSTALLKERNEL) or' echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' *_defconfig - Select default config from arch/$(ARCH)/configs' + echo ' *_defconfig - Select default config from arch/powerpc/configs' echo '' echo ' Targets with
embed a device tree blob inside the image' echo ' These targets support board with firmware that does not' echo ' support passing a device tree directly. Replace
with the' - echo ' name of a dts file from the arch/$(ARCH)/boot/dts/ directory' + echo ' name of a dts file from the arch/powerpc/boot/dts/ directory' echo ' (minus the .dts extension).' echo $(foreach cfg,$(generated_configs), From 42449052c94f22e18e01d71147d8fd75cb58132a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 6 Dec 2023 22:55:46 +1100 Subject: [PATCH 167/310] powerpc/vdso: No need to undef powerpc for 64-bit build The vdso Makefile adds -U$(ARCH) to CPPFLAGS for the vdso64.lds linker script. ARCH is always powerpc, so it becomes -Upowerpc, which means undefine the "powerpc" symbol. But the 64-bit compiler doesn't define powerpc in the first place, compare: $ gcc-5.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-gcc -m32 -E -dM - Link: https://msgid.link/20231206115548.1466874-2-mpe@ellerman.id.au --- arch/powerpc/kernel/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 0c7d82c270c3..1b93655c2857 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -71,7 +71,7 @@ AS64FLAGS := -D__VDSO64__ targets += vdso32.lds CPPFLAGS_vdso32.lds += -P -C -Upowerpc targets += vdso64.lds -CPPFLAGS_vdso64.lds += -P -C -U$(ARCH) +CPPFLAGS_vdso64.lds += -P -C # link rule for the .so file, .lds has to be first $(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) $(obj)/vgettimeofday-32.o FORCE From 22f17b02f88b48c01d3ac38d40d2b0b695ab2d10 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 6 Dec 2023 22:55:47 +1100 Subject: [PATCH 168/310] powerpc/Makefile: Default to ppc64le_defconfig when cross building If the kernel is being cross compiled, there is no information from uname on which defconfig is most appropriate, so the Makefile defaults to ppc64. However these days almost all distros that support powerpc are little endian, so it's more likely that defaulting to ppc64le_defconfig will produce something useful for a user. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231206115548.1466874-3-mpe@ellerman.id.au --- arch/powerpc/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index b0bc17c35ed7..48c06f5a0dc1 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -16,9 +16,9 @@ HAS_BIARCH := $(call cc-option-yn, -m32) CROSS32_COMPILE ?= # If we're on a ppc/ppc64/ppc64le machine use that defconfig, otherwise just use -# ppc64_defconfig because we have nothing better to go on. +# ppc64le_defconfig because we have nothing better to go on. uname := $(shell uname -m) -KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig +KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64le)_defconfig new_nm := $(shell if $(NM) --help 2>&1 | grep -- '--synthetic' > /dev/null; then echo y; else echo n; fi) From 402928b58ec62b42b11992a7b51ff2f56ed65a18 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 6 Dec 2023 22:55:48 +1100 Subject: [PATCH 169/310] powerpc/Makefile: Auto detect cross compiler If no cross compiler is specified, try to auto detect one. Look for various combinations, matching: powerpc(64(le)?)?(-unknown)?-linux(-gnu)?- There are more possibilities, but the above is known to find a compiler on Fedora and Ubuntu (which use linux-gnu-), and also detects the kernel.org cross compilers (which use linux-). This allows cross compiling with simply: # Ubuntu $ sudo apt install gcc-powerpc-linux-gnu # Fedora $ sudo dnf install gcc-powerpc64-linux-gnu $ make ARCH=powerpc defconfig $ make ARCH=powerpc -j 4 Inspired by arch/parisc/Makefile. Acked-by: Segher Boessenkool Signed-off-by: Michael Ellerman Link: https://msgid.link/20231206115548.1466874-4-mpe@ellerman.id.au --- arch/powerpc/Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 48c06f5a0dc1..051247027da0 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -10,6 +10,17 @@ # Rewritten by Cort Dougan and Paul Mackerras # +ifdef cross_compiling + ifeq ($(CROSS_COMPILE),) + # Auto detect cross compiler prefix. + # Look for: (powerpc(64(le)?)?)(-unknown)?-linux(-gnu)?- + CC_ARCHES := powerpc powerpc64 powerpc64le + CC_SUFFIXES := linux linux-gnu unknown-linux-gnu + CROSS_COMPILE := $(call cc-cross-prefix, $(foreach a,$(CC_ARCHES), \ + $(foreach s,$(CC_SUFFIXES),$(a)-$(s)-))) + endif +endif + HAS_BIARCH := $(call cc-option-yn, -m32) # Set default 32 bits cross compilers for vdso and boot wrapper From 79c603ee43b2674fba0257803bab265147821955 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 8 Dec 2023 22:57:33 +0100 Subject: [PATCH 170/310] Documentation/x86: Document what /proc/cpuinfo is for This has been long overdue. Write down what x86's version of /proc/cpuinfo is and should be used for. With improvements by dhansen. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Dave Hansen Link: https://lore.kernel.org/r/20231129101700.28482-1-bp@alien8.de --- Documentation/arch/x86/cpuinfo.rst | 85 +++++++++++++++++++++++------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/Documentation/arch/x86/cpuinfo.rst b/Documentation/arch/x86/cpuinfo.rst index 08246e8ac835..8895784d4784 100644 --- a/Documentation/arch/x86/cpuinfo.rst +++ b/Documentation/arch/x86/cpuinfo.rst @@ -7,27 +7,74 @@ x86 Feature Flags Introduction ============ -On x86, flags appearing in /proc/cpuinfo have an X86_FEATURE definition -in arch/x86/include/asm/cpufeatures.h. If the kernel cares about a feature -or KVM want to expose the feature to a KVM guest, it can and should have -an X86_FEATURE_* defined. These flags represent hardware features as -well as software features. +The list of feature flags in /proc/cpuinfo is not complete and +represents an ill-fated attempt from long time ago to put feature flags +in an easy to find place for userspace. -If users want to know if a feature is available on a given system, they -try to find the flag in /proc/cpuinfo. If a given flag is present, it -means that the kernel supports it and is currently making it available. -If such flag represents a hardware feature, it also means that the -hardware supports it. +However, the amount of feature flags is growing by the CPU generation, +leading to unparseable and unwieldy /proc/cpuinfo. + +What is more, those feature flags do not even need to be in that file +because userspace doesn't care about them - glibc et al already use +CPUID to find out what the target machine supports and what not. + +And even if it doesn't show a particular feature flag - although the CPU +still does have support for the respective hardware functionality and +said CPU supports CPUID faulting - userspace can simply probe for the +feature and figure out if it is supported or not, regardless of whether +it is being advertised somewhere. + +Furthermore, those flag strings become an ABI the moment they appear +there and maintaining them forever when nothing even uses them is a lot +of wasted effort. + +So, the current use of /proc/cpuinfo is to show features which the +kernel has *enabled* and *supports*. As in: the CPUID feature flag is +there, there's an additional setup which the kernel has done while +booting and the functionality is ready to use. A perfect example for +that is "user_shstk" where additional code enablement is present in the +kernel to support shadow stack for user programs. + +So, if users want to know if a feature is available on a given system, +they try to find the flag in /proc/cpuinfo. If a given flag is present, +it means that + +* the kernel knows about the feature enough to have an X86_FEATURE bit + +* the kernel supports it and is currently making it available either to + userspace or some other part of the kernel + +* if the flag represents a hardware feature the hardware supports it. + +The absence of a flag in /proc/cpuinfo by itself means almost nothing to +an end user. + +On the one hand, a feature like "vaes" might be fully available to user +applications on a kernel that has not defined X86_FEATURE_VAES and thus +there is no "vaes" in /proc/cpuinfo. + +On the other hand, a new kernel running on non-VAES hardware would also +have no "vaes" in /proc/cpuinfo. There's no way for an application or +user to tell the difference. + +The end result is that the flags field in /proc/cpuinfo is marginally +useful for kernel debugging, but not really for anything else. +Applications should instead use things like the glibc facilities for +querying CPU support. Users should rely on tools like +tools/arch/x86/kcpuid and cpuid(1). + +Regarding implementation, flags appearing in /proc/cpuinfo have an +X86_FEATURE definition in arch/x86/include/asm/cpufeatures.h. These flags +represent hardware features as well as software features. + +If the kernel cares about a feature or KVM want to expose the feature to +a KVM guest, it should only then expose it to the guest when the guest +needs to parse /proc/cpuinfo. Which, as mentioned above, is highly +unlikely. KVM can synthesize the CPUID bit and the KVM guest can simply +query CPUID and figure out what the hypervisor supports and what not. As +already stated, /proc/cpuinfo is not a dumping ground for useless +feature flags. -If the expected flag does not appear in /proc/cpuinfo, things are murkier. -Users need to find out the reason why the flag is missing and find the way -how to enable it, which is not always easy. There are several factors that -can explain missing flags: the expected feature failed to enable, the feature -is missing in hardware, platform firmware did not enable it, the feature is -disabled at build or run time, an old kernel is in use, or the kernel does -not support the feature and thus has not enabled it. In general, /proc/cpuinfo -shows features which the kernel supports. For a full list of CPUID flags -which the CPU supports, use tools/arch/x86/kcpuid. How are feature flags created? ============================== From 772ca413537eca9010e93922699537f56db9c8c4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 29 Nov 2023 14:33:28 +0100 Subject: [PATCH 171/310] x86/paravirt: Introduce ALT_NOT_XEN Introduce the macro ALT_NOT_XEN as a short form of ALT_NOT(X86_FEATURE_XENPV). No functional changes. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231129133332.31043-2-jgross@suse.com --- arch/x86/include/asm/paravirt.h | 42 ++++++++++++--------------- arch/x86/include/asm/paravirt_types.h | 3 ++ 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 693c61dbdd9c..aa76ac7c806c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -142,8 +142,7 @@ static inline void write_cr0(unsigned long x) static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, - "mov %%cr2, %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) @@ -154,13 +153,12 @@ static __always_inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, - "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { - PVOP_ALT_VCALL1(mmu.write_cr3, x, - "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) @@ -182,7 +180,7 @@ extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) @@ -390,27 +388,25 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -444,14 +440,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -464,7 +459,7 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } @@ -472,7 +467,7 @@ static inline pud_t __pud(pudval_t val) static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) @@ -492,8 +487,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } @@ -501,7 +495,7 @@ static inline p4d_t __p4d(p4dval_t val) static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -687,17 +681,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { - PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { - PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) @@ -769,7 +763,7 @@ void native_pv_lock_init(void) __init; .endm #define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT(X86_FEATURE_XENPV) + ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f4fb2e3ec7b8..483e19e5ca7a 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -557,5 +557,8 @@ extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #endif /* __ASSEMBLY__ */ + +#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) + #endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ From 9824b00c2b58f9e1072ef3ece571a2cfc71089d4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 29 Nov 2023 14:33:29 +0100 Subject: [PATCH 172/310] x86/paravirt: Move some functions and defines to alternative.c As a preparation for replacing paravirt patching completely by alternative patching, move some backend functions and #defines to the alternatives code and header. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231129133332.31043-3-jgross@suse.com --- arch/x86/include/asm/alternative.h | 16 ++++++++++++ arch/x86/include/asm/paravirt.h | 12 --------- arch/x86/include/asm/paravirt_types.h | 4 +-- arch/x86/include/asm/qspinlock_paravirt.h | 4 +-- arch/x86/kernel/alternative.c | 10 ++++++++ arch/x86/kernel/kvm.c | 4 +-- arch/x86/kernel/paravirt.c | 30 +++++++---------------- arch/x86/xen/irq.c | 2 +- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 65f79092c9d9..ce788ab4e77c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -330,6 +330,22 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +/* Macro for creating assembler functions avoiding any C magic. */ +#define DEFINE_ASM_FUNC(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + +void BUG_func(void); +void nop_func(void); + #else /* __ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index aa76ac7c806c..f18bfa7f3070 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -720,18 +720,6 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 -#define DEFINE_PARAVIRT_ASM(func, instr, sec) \ - asm (".pushsection " #sec ", \"ax\"\n" \ - ".global " #func "\n\t" \ - ".type " #func ", @function\n\t" \ - ASM_FUNC_ALIGN "\n" \ - #func ":\n\t" \ - ASM_ENDBR \ - instr "\n\t" \ - ASM_RET \ - ".size " #func ", . - " #func "\n\t" \ - ".popsection") - extern void default_banner(void); void native_pv_lock_init(void) __init; diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 483e19e5ca7a..166e9618158f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -540,8 +540,6 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -void _paravirt_nop(void); -void paravirt_BUG(void); unsigned long paravirt_ret0(void); #ifdef CONFIG_PARAVIRT_XXL u64 _paravirt_ident_64(u64); @@ -551,7 +549,7 @@ void pv_native_irq_enable(void); unsigned long pv_native_read_cr2(void); #endif -#define paravirt_nop ((void *)_paravirt_nop) +#define paravirt_nop ((void *)nop_func) extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 85b6e3609cb9..ef9697f20129 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "pop %rdx\n\t" \ FRAME_END -DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock, - PV_UNLOCK_ASM, .spinlock.text); +DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index be35c8ccf826..ca25dd280b8c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -385,6 +385,16 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } } +/* Low-level backend functions usable from alternative code replacements. */ +DEFINE_ASM_FUNC(nop_func, "", .entry.text); +EXPORT_SYMBOL_GPL(nop_func); + +noinstr void BUG_func(void) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(BUG_func); + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0ddb3bd0f1aa..c461c1a4b6af 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -803,8 +803,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ "setne %al\n\t" -DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, - PV_VCPU_PREEMPTED_ASM, .text); +DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 97f1436c1a20..acc5b1004f0f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -34,14 +34,8 @@ #include #include -/* - * nop stub, which must not clobber anything *including the stack* to - * avoid confusing the entry prologues. - */ -DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); - /* stub always returning 0. */ -DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); +DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { @@ -49,12 +43,6 @@ void __init default_banner(void) pv_info.name); } -/* Undefined instruction for dealing with missing ops pointers. */ -noinstr void paravirt_BUG(void) -{ - BUG(); -} - static unsigned paravirt_patch_call(void *insn_buff, const void *target, unsigned long addr, unsigned len) { @@ -64,11 +52,11 @@ static unsigned paravirt_patch_call(void *insn_buff, const void *target, } #ifdef CONFIG_PARAVIRT_XXL -DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); -DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); +DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -96,9 +84,9 @@ unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with paravirt_BUG() */ - ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); - else if (opfunc == _paravirt_nop) + /* If there's no function, patch it with BUG_func() */ + ret = paravirt_patch_call(insn_buff, BUG_func, addr, len); + else if (opfunc == nop_func) ret = 0; else /* Otherwise call the function. */ diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 6092fea7d651..39982f955cfe 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = { /* Initial interrupt flag handling only called while interrupts off. */ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), - .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), + .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func), .safe_halt = xen_safe_halt, .halt = xen_halt, From da0fe6e68e104f79c1fef5c62a17bdd1634ea61c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sun, 10 Dec 2023 07:21:36 +0100 Subject: [PATCH 173/310] x86/alternative: Add indirect call patching In order to prepare replacing of paravirt patching with alternative patching, add the capability to replace an indirect call with a direct one. This is done via a new flag ALT_FLAG_CALL as the target of the CALL instruction needs to be evaluated using the value of the location addressed by the indirect call. For convenience, add a macro for a default CALL instruction. In case it is being used without the new flag being set, it will result in a BUG() when being executed. As in most cases, the feature used will be X86_FEATURE_ALWAYS so add another macro ALT_CALL_ALWAYS usable for the flags parameter of the ALTERNATIVE macros. For a complete replacement, handle the special cases of calling a nop function and an indirect call of NULL the same way as paravirt does. [ bp: Massage commit message, fixup the debug output and clarify flow more. ] Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231210062138.2417-4-jgross@suse.com --- arch/x86/include/asm/alternative.h | 9 +++++ arch/x86/kernel/alternative.c | 58 ++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ce788ab4e77c..472334eed6f3 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -10,6 +10,9 @@ #define ALT_FLAG_NOT (1 << 0) #define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) #ifndef __ASSEMBLY__ @@ -150,6 +153,8 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define ALT_CALL_INSTR "call BUG_func" + #define b_replacement(num) "664"#num #define e_replacement(num) "665"#num @@ -386,6 +391,10 @@ void nop_func(void); .byte \alt_len .endm +.macro ALT_CALL_INSTR + call BUG_func +.endm + /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ca25dd280b8c..657305137bf9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -395,6 +395,53 @@ noinstr void BUG_func(void) } EXPORT_SYMBOL_GPL(BUG_func); +#define CALL_RIP_REL_OPCODE 0xff +#define CALL_RIP_REL_MODRM 0x15 + +/* + * Rewrite the "call BUG_func" replacement to point to the target of the + * indirect pv_ops call "call *disp(%ip)". + */ +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +{ + void *target, *bug = &BUG_func; + s32 disp; + + if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { + pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); + BUG(); + } + + if (a->instrlen != 6 || + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { + pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); + BUG(); + } + + /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ + disp = *(s32 *)(instr + 2); +#ifdef CONFIG_X86_64 + /* ff 15 00 00 00 00 call *0x0(%rip) */ + /* target address is stored at "next instruction + disp". */ + target = *(void **)(instr + a->instrlen + disp); +#else + /* ff 15 00 00 00 00 call *0x0 */ + /* target address is stored at disp. */ + target = *(void **)disp; +#endif + if (!target) + target = bug; + + /* (BUG_func - .) + (target - BUG_func) := target - . */ + *(s32 *)(insn_buff + 1) += target - bug; + + if (target == &nop_func) + return 0; + + return 5; +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -452,16 +499,21 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, continue; } - DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", - (a->flags & ALT_FLAG_NOT) ? "!" : "", + DPRINTK(ALT, "feat: %d32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen); + replacement, a->replacementlen, a->flags); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; + if (a->flags & ALT_FLAG_DIRECT_CALL) { + insn_buff_sz = alt_replace_call(instr, insn_buff, a); + if (insn_buff_sz < 0) + continue; + } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; From 60bc276b129eef8113f9d9b0a5cd5ae7f4c90acb Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sun, 10 Dec 2023 07:21:37 +0100 Subject: [PATCH 174/310] x86/paravirt: Switch mixed paravirt/alternative calls to alternatives Instead of stacking alternative and paravirt patching, use the new ALT_FLAG_CALL flag to switch those mixed calls to pure alternative handling. Eliminate the need to be careful regarding the sequence of alternative and paravirt patching. [ bp: Touch up commit message. ] Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231210062138.2417-5-jgross@suse.com --- arch/x86/include/asm/alternative.h | 5 ++-- arch/x86/include/asm/paravirt.h | 12 ++++---- arch/x86/include/asm/paravirt_types.h | 40 +++++++++++++++------------ arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/callthunks.c | 17 ++++++------ arch/x86/kernel/module.c | 20 ++++---------- 6 files changed, 44 insertions(+), 51 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 472334eed6f3..fcd20c6dc7f9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -89,6 +89,8 @@ struct alt_instr { u8 replacementlen; /* length of new instruction */ } __packed; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -104,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); struct module; -struct paravirt_patch_site; struct callthunk_sites { s32 *call_start, *call_end; - struct paravirt_patch_site *pv_start, *pv_end; + struct alt_instr *alt_start, *alt_end; }; #ifdef CONFIG_CALL_THUNKS diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f18bfa7f3070..973c2ac2d25c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -738,20 +738,20 @@ void native_pv_lock_init(void) __init; #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY #define PARA_PATCH(off) ((off) / 8) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops) #define PARA_INDIRECT(addr) *addr(%rip) -#ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), - ANNOTATE_RETPOLINE_SAFE; - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm -#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT_XEN +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ + "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ + "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 166e9618158f..9cad536fc08d 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -276,15 +276,11 @@ extern struct paravirt_patch_template pv_ops; #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); +#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) int paravirt_disable_iospace(void); -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ +/* This generates an indirect call based on the operation type number. */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ "call *%[paravirt_opptr];" @@ -317,12 +313,6 @@ int paravirt_disable_iospace(void); * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * * Unfortunately there's no way to get gcc to generate the args setup * for the call, and then allow the call itself to be generated by an * inline asm. Because of this, we must do the complete arg setup and @@ -421,14 +411,27 @@ int paravirt_disable_iospace(void); __mask & __eax; \ }) - +/* + * Use alternative patching for paravirt calls: + * - For replacing an indirect call with a direct one, use the "normal" + * ALTERNATIVE() macro with the indirect call as the initial code sequence, + * which will be replaced with the related direct call by using the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + * - In case the replacement is either a direct call or a short code sequence + * depending on a feature bit, the ALTERNATIVE_2() macro is being used. + * The indirect call is the initial code sequence again, while the special + * code sequence is selected with the specified feature bit. In case the + * feature is not active, the direct call is used as above via the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + */ #define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ + asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \ + ALT_CALL_ALWAYS) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ @@ -439,10 +442,11 @@ int paravirt_disable_iospace(void); ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ - alt, cond) \ + asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \ + ALT_CALL_INSTR, ALT_CALL_ALWAYS, \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 657305137bf9..48938bc36d71 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -160,7 +160,6 @@ extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 57e5c2e75c2a..64ad2ddea121 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -233,14 +233,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) } static __init_or_module void -patch_paravirt_call_sites(struct paravirt_patch_site *start, - struct paravirt_patch_site *end, - const struct core_text *ct) +patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, + const struct core_text *ct) { - struct paravirt_patch_site *p; + struct alt_instr *a; - for (p = start; p < end; p++) - patch_call((void *)&p->instr_offset + p->instr_offset, ct); + for (a = start; a < end; a++) + patch_call((void *)&a->instr_offset + a->instr_offset, ct); } static __init_or_module void @@ -248,7 +247,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -257,8 +256,8 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .pv_start = __parainstructions, - .pv_end = __parainstructions_end + .alt_start = __alt_instructions, + .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5f71a0cf4399..e18914c0e38a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -276,7 +276,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL, + *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -286,8 +286,6 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -304,14 +302,6 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } - /* - * See alternative_instructions() for the ordering rules between the - * various patching types. - */ - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -341,7 +331,7 @@ int module_finalize(const Elf_Ehdr *hdr, void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } - if (calls || para) { + if (calls || alt) { struct callthunk_sites cs = {}; if (calls) { @@ -349,9 +339,9 @@ int module_finalize(const Elf_Ehdr *hdr, cs.call_end = (void *)calls->sh_addr + calls->sh_size; } - if (para) { - cs.pv_start = (void *)para->sh_addr; - cs.pv_end = (void *)para->sh_addr + para->sh_size; + if (alt) { + cs.alt_start = (void *)alt->sh_addr; + cs.alt_end = (void *)alt->sh_addr + alt->sh_size; } callthunks_patch_module_calls(&cs, me); From f7af6977621a41661696d94c0c0a20c761404476 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sun, 10 Dec 2023 07:21:38 +0100 Subject: [PATCH 175/310] x86/paravirt: Remove no longer needed paravirt patching code Now that paravirt is using the alternatives patching infrastructure, remove the paravirt patching code. Signed-off-by: Juergen Gross Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231210062138.2417-6-jgross@suse.com --- arch/x86/include/asm/paravirt.h | 13 ------ arch/x86/include/asm/paravirt_types.h | 38 --------------- arch/x86/include/asm/text-patching.h | 12 ----- arch/x86/kernel/alternative.c | 67 +-------------------------- arch/x86/kernel/paravirt.c | 30 ------------ arch/x86/kernel/vmlinux.lds.S | 13 ------ arch/x86/tools/relocs.c | 2 +- 7 files changed, 3 insertions(+), 172 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 973c2ac2d25c..8bcf7584e7dd 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -725,23 +725,10 @@ void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ -#define _PVSITE(ptype, ops) \ -771:; \ - ops; \ -772:; \ - .pushsection .parainstructions,"a"; \ - .long 771b-.; \ - .byte ptype; \ - .byte 772b-771b; \ - .popsection - - #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY -#define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops) #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 9cad536fc08d..d8e85d2cf8d5 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,15 +2,6 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -#ifndef __ASSEMBLY__ -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - s32 instr_offset; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -} __packed; -#endif - #ifdef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ @@ -250,32 +241,6 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "m" (pv_ops.op) -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - " .long 771b-.\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]") - -/* Simple instruction patching code. */ -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" - -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); #define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) int paravirt_disable_iospace(void); @@ -555,9 +520,6 @@ unsigned long pv_native_read_cr2(void); #define paravirt_nop ((void *)nop_func) -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #endif /* __ASSEMBLY__ */ #define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 29832c338cdc..0b70653a98c1 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -6,18 +6,6 @@ #include #include -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 48938bc36d71..f26983a3e035 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1472,48 +1472,6 @@ int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#ifdef CONFIG_PARAVIRT - -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) -{ - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; - } -} - -void __init_or_module apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{ - struct paravirt_patch_site *p; - char insn_buff[MAX_PATCH_LEN]; - u8 *instr; - - for (p = start; p < end; p++) { - unsigned int used; - - instr = (u8 *)&p->instr_offset + p->instr_offset; - BUG_ON(p->len > MAX_PATCH_LEN); - /* prep the buffer with the original instructions */ - memcpy(insn_buff, instr, p->len); - used = paravirt_patch(p->type, insn_buff, (unsigned long)instr, p->len); - - BUG_ON(used > p->len); - - /* Pad the rest with nops */ - add_nops(insn_buff + used, p->len - used); - text_poke_early(instr, insn_buff, p->len); - } -} -extern struct paravirt_patch_site __start_parainstructions[], - __stop_parainstructions[]; -#endif /* CONFIG_PARAVIRT */ - /* * Self-test for the INT3 based CALL emulation code. * @@ -1649,28 +1607,11 @@ void __init alternative_instructions(void) */ /* - * Paravirt patching and alternative patching can be combined to - * replace a function call with a short direct code sequence (e.g. - * by setting a constant return value instead of doing that in an - * external function). - * In order to make this work the following sequence is required: - * 1. set (artificial) features depending on used paravirt - * functions which can later influence alternative patching - * 2. apply paravirt patching (generally replacing an indirect - * function call with a direct one) - * 3. apply alternative patching (e.g. replacing a direct function - * call with a custom code sequence) - * Doing paravirt patching after alternative patching would clobber - * the optimization of the custom code with a function call again. + * Make sure to set (artificial) features depending on used paravirt + * functions which can later influence alternative patching. */ paravirt_set_cap(); - /* - * First patch paravirt functions, such that we overwrite the indirect - * call with the direct call. - */ - apply_paravirt(__parainstructions, __parainstructions_end); - __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -1681,10 +1622,6 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); - /* - * Then patch alternatives, such that those paravirt calls that are in - * alternatives can be overwritten by their immediate fragments. - */ apply_alternatives(__alt_instructions, __alt_instructions_end); /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index acc5b1004f0f..5358d43886ad 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -43,14 +43,6 @@ void __init default_banner(void) pv_info.name); } -static unsigned paravirt_patch_call(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - __text_gen_insn(insn_buff, CALL_INSN_OPCODE, - (void *)addr, target, CALL_INSN_SIZE); - return CALL_INSN_SIZE; -} - #ifdef CONFIG_PARAVIRT_XXL DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); @@ -73,28 +65,6 @@ static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_remove_page(tlb, table); } -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - /* - * Neat trick to map patch type back to the call within the - * corresponding structure. - */ - void *opfunc = *((void **)&pv_ops + type); - unsigned ret; - - if (opfunc == NULL) - /* If there's no function, patch it with BUG_func() */ - ret = paravirt_patch_call(insn_buff, BUG_func, addr, len); - else if (opfunc == nop_func) - ret = 0; - else - /* Otherwise call the function. */ - ret = paravirt_patch_call(insn_buff, opfunc, addr, len); - - return ret; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 54a5596adaa6..a349dbfc6d5a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -267,19 +267,6 @@ SECTIONS } #endif - /* - * start address and size of operations which during runtime - * can be patched with virtualization friendly instructions or - * baremetal native ones. Think page table operations. - * Details in paravirt_types.h - */ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - #ifdef CONFIG_RETPOLINE /* * List of instructions that call/jmp/jcc to retpoline thunks diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index d30949e25ebd..a3bae2b24626 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -66,7 +66,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { [S_REL] = "^(__init_(begin|end)|" "__x86_cpu_dev_(start|end)|" - "(__parainstructions|__alt_instructions)(_end)?|" + "__alt_instructions(_end)?|" "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|" "__(start|end)_pci_.*|" #if CONFIG_FW_LOADER From f789383fa34a266d0c1a76f272043a15a8edf733 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 30 Nov 2023 16:39:33 +0100 Subject: [PATCH 176/310] x86/ia32: State that IA32 emulation is disabled Issue a short message once, on the first try to load a 32-bit process to save people time when wondering why it won't load and trying to execute it, would say: -bash: ./strsep32: cannot execute binary file: Exec format error Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/20231130155213.1407-1-bp@alien8.de --- arch/x86/include/asm/elf.h | 2 +- arch/x86/include/asm/ia32.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index a0234dfd1031..1e16bd5ac781 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -150,7 +150,7 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - ((elf_check_arch_ia32(x) && ia32_enabled()) || \ + ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) static inline void elf_common_init(struct thread_struct *t, diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 5a2ae24b1204..094886a8717e 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H - #ifdef CONFIG_IA32_EMULATION #include @@ -84,4 +83,14 @@ static inline bool ia32_enabled(void) #endif +static inline bool ia32_enabled_verbose(void) +{ + bool enabled = ia32_enabled(); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) + pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); + + return enabled; +} + #endif /* _ASM_X86_IA32_H */ From 8885c7398fe56c49f14be6ce0ac202385f3cd818 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 27 Nov 2023 13:00:52 +0100 Subject: [PATCH 177/310] arm64: mm: Only map KPTI trampoline if it is going to be used Avoid creating the fixmap entries for the KPTI trampoline if KPTI is not in use. Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231127120049.2258650-7-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 15f6347d23b6..6c8078916f5e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -674,6 +674,9 @@ static int __init map_entry_trampoline(void) { int i; + if (!arm64_kernel_unmapped_at_el0()) + return 0; + pgprot_t prot = kernel_exec_prot(); phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); From 7540f70df98f5c46feb5fe2257f93f543e5821e5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 27 Nov 2023 13:00:53 +0100 Subject: [PATCH 178/310] arm64: Kconfig: drop KAISER reference from KPTI option description KAISER is a reference to the KASLR hardening technique that already existed before Meltdown happened, and by now, it is sufficiently obscure that mentioning it does not actually clarify anything. So remove this reference, and replace it with KPTI. Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231127120049.2258650-8-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7b071a00425d..b67e6934316f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1549,7 +1549,7 @@ config ARCH_FORCE_MAX_ORDER Don't change if unsure. config UNMAP_KERNEL_AT_EL0 - bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT + bool "Unmap kernel when running in userspace (KPTI)" if EXPERT default y help Speculation attacks against some high-performance processors can From 1beef60e7d6b90be826a73f626204f55a4cd7640 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 24 Nov 2023 11:05:10 +0000 Subject: [PATCH 179/310] arm64: stacktrace: factor out kernel unwind state On arm64 we share some unwinding code between the regular kernel unwinder and the KVM hyp unwinder. Some of this common code only matters to the regular unwinder, e.g. the `kr_cur` and `task` fields in the common struct unwind_state. We're likely to add more state which only matters for regular kernel unwinding (or only for hyp unwinding). In preparation for such changes, this patch factors out the kernel-specific state into a new struct kunwind_state, and updates the kernel unwind code accordingly. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Mark Brown Cc: Puranjay Mohan Cc: Will Deacon Reviewed-by: Puranjay Mohan Reviewed-by: Kalesh Singh Reviewed-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20231124110511.2795958-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/stacktrace/common.h | 19 +--- arch/arm64/include/asm/stacktrace/nvhe.h | 2 +- arch/arm64/kernel/stacktrace.c | 113 +++++++++++++-------- 3 files changed, 74 insertions(+), 60 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 508f734de46e..f63dc654e545 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -9,7 +9,6 @@ #ifndef __ASM_STACKTRACE_COMMON_H #define __ASM_STACKTRACE_COMMON_H -#include #include struct stack_info { @@ -23,12 +22,6 @@ struct stack_info { * @fp: The fp value in the frame record (or the real fp) * @pc: The lr value in the frame record (or the real lr) * - * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance - * associated with the most recently encountered replacement lr - * value. - * - * @task: The task being unwound. - * * @stack: The stack currently being unwound. * @stacks: An array of stacks which can be unwound. * @nr_stacks: The number of stacks in @stacks. @@ -36,10 +29,6 @@ struct stack_info { struct unwind_state { unsigned long fp; unsigned long pc; -#ifdef CONFIG_KRETPROBES - struct llist_node *kr_cur; -#endif - struct task_struct *task; struct stack_info stack; struct stack_info *stacks; @@ -66,14 +55,8 @@ static inline bool stackinfo_on_stack(const struct stack_info *info, return true; } -static inline void unwind_init_common(struct unwind_state *state, - struct task_struct *task) +static inline void unwind_init_common(struct unwind_state *state) { - state->task = task; -#ifdef CONFIG_KRETPROBES - state->kr_cur = NULL; -#endif - state->stack = stackinfo_get_unknown(); } diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 25ab83a315a7..44759281d0d4 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -31,7 +31,7 @@ static inline void kvm_nvhe_unwind_init(struct unwind_state *state, unsigned long fp, unsigned long pc) { - unwind_init_common(state, NULL); + unwind_init_common(state); state->fp = fp; state->pc = pc; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 17f66a74c745..aafc89192787 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,31 @@ #include #include +/* + * Kernel unwind state + * + * @common: Common unwind state. + * @task: The task being unwound. + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + */ +struct kunwind_state { + struct unwind_state common; + struct task_struct *task; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif +}; + +static __always_inline void +kunwind_init(struct kunwind_state *state, + struct task_struct *task) +{ + unwind_init_common(&state->common); + state->task = task; +} + /* * Start an unwind from a pt_regs. * @@ -26,13 +52,13 @@ * The regs must be on a stack currently owned by the calling task. */ static __always_inline void -unwind_init_from_regs(struct unwind_state *state, - struct pt_regs *regs) +kunwind_init_from_regs(struct kunwind_state *state, + struct pt_regs *regs) { - unwind_init_common(state, current); + kunwind_init(state, current); - state->fp = regs->regs[29]; - state->pc = regs->pc; + state->common.fp = regs->regs[29]; + state->common.pc = regs->pc; } /* @@ -44,12 +70,12 @@ unwind_init_from_regs(struct unwind_state *state, * The function which invokes this must be noinline. */ static __always_inline void -unwind_init_from_caller(struct unwind_state *state) +kunwind_init_from_caller(struct kunwind_state *state) { - unwind_init_common(state, current); + kunwind_init(state, current); - state->fp = (unsigned long)__builtin_frame_address(1); - state->pc = (unsigned long)__builtin_return_address(0); + state->common.fp = (unsigned long)__builtin_frame_address(1); + state->common.pc = (unsigned long)__builtin_return_address(0); } /* @@ -63,35 +89,38 @@ unwind_init_from_caller(struct unwind_state *state) * call this for the current task. */ static __always_inline void -unwind_init_from_task(struct unwind_state *state, - struct task_struct *task) +kunwind_init_from_task(struct kunwind_state *state, + struct task_struct *task) { - unwind_init_common(state, task); + kunwind_init(state, task); - state->fp = thread_saved_fp(task); - state->pc = thread_saved_pc(task); + state->common.fp = thread_saved_fp(task); + state->common.pc = thread_saved_pc(task); } static __always_inline int -unwind_recover_return_address(struct unwind_state *state) +kunwind_recover_return_address(struct kunwind_state *state) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (state->task->ret_stack && - (state->pc == (unsigned long)return_to_handler)) { + (state->common.pc == (unsigned long)return_to_handler)) { unsigned long orig_pc; - orig_pc = ftrace_graph_ret_addr(state->task, NULL, state->pc, - (void *)state->fp); - if (WARN_ON_ONCE(state->pc == orig_pc)) + orig_pc = ftrace_graph_ret_addr(state->task, NULL, + state->common.pc, + (void *)state->common.fp); + if (WARN_ON_ONCE(state->common.pc == orig_pc)) return -EINVAL; - state->pc = orig_pc; + state->common.pc = orig_pc; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #ifdef CONFIG_KRETPROBES - if (is_kretprobe_trampoline(state->pc)) { - state->pc = kretprobe_find_ret_addr(state->task, - (void *)state->fp, - &state->kr_cur); + if (is_kretprobe_trampoline(state->common.pc)) { + unsigned long orig_pc; + orig_pc = kretprobe_find_ret_addr(state->task, + (void *)state->common.fp, + &state->kr_cur); + state->common.pc = orig_pc; } #endif /* CONFIG_KRETPROBES */ @@ -106,38 +135,38 @@ unwind_recover_return_address(struct unwind_state *state) * and the location (but not the fp value) of B. */ static __always_inline int -unwind_next(struct unwind_state *state) +kunwind_next(struct kunwind_state *state) { struct task_struct *tsk = state->task; - unsigned long fp = state->fp; + unsigned long fp = state->common.fp; int err; /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - err = unwind_next_frame_record(state); + err = unwind_next_frame_record(&state->common); if (err) return err; - state->pc = ptrauth_strip_kernel_insn_pac(state->pc); + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); - return unwind_recover_return_address(state); + return kunwind_recover_return_address(state); } static __always_inline void -unwind(struct unwind_state *state, stack_trace_consume_fn consume_entry, - void *cookie) +do_kunwind(struct kunwind_state *state, stack_trace_consume_fn consume_entry, + void *cookie) { - if (unwind_recover_return_address(state)) + if (kunwind_recover_return_address(state)) return; while (1) { int ret; - if (!consume_entry(cookie, state->pc)) + if (!consume_entry(cookie, state->common.pc)) break; - ret = unwind_next(state); + ret = kunwind_next(state); if (ret < 0) break; } @@ -190,22 +219,24 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, STACKINFO_EFI, #endif }; - struct unwind_state state = { - .stacks = stacks, - .nr_stacks = ARRAY_SIZE(stacks), + struct kunwind_state state = { + .common = { + .stacks = stacks, + .nr_stacks = ARRAY_SIZE(stacks), + }, }; if (regs) { if (task != current) return; - unwind_init_from_regs(&state, regs); + kunwind_init_from_regs(&state, regs); } else if (task == current) { - unwind_init_from_caller(&state); + kunwind_init_from_caller(&state); } else { - unwind_init_from_task(&state, task); + kunwind_init_from_task(&state, task); } - unwind(&state, consume_entry, cookie); + do_kunwind(&state, consume_entry, cookie); } static bool dump_backtrace_entry(void *arg, unsigned long where) From 1aba06e7b2b49c1a0e905fcaf9c56d70164fbd8d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 24 Nov 2023 11:05:11 +0000 Subject: [PATCH 180/310] arm64: stacktrace: factor out kunwind_stack_walk() Currently arm64 uses the generic arch_stack_walk() interface for all stack walking code. This only passes a PC value and cookie to the unwind callback, whereas we'd like to pass some additional information in some cases. For example, the BPF exception unwinder wants the FP, for reliable stacktrace we'll want to perform additional checks on other portions of unwind state, and we'd like to expand the information printed by dump_backtrace() to include provenance and reliability information. As preparation for all of the above, this patch factors the core unwind logic out of arch_stack_walk() and into a new kunwind_stack_walk() function which provides all of the unwind state to a callback function. The existing arch_stack_walk() interface is implemented atop this. The kunwind_stack_walk() function is intended to be a private implementation detail of unwinders in stacktrace.c, and not something to be exported generally to kernel code. It is __always_inline'd into its caller so that neither it or its caller appear in stactraces (which is the existing/required behavior for arch_stack_walk() and friends) and so that the compiler can optimize away some of the indirection. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Mark Brown Cc: Puranjay Mohan Cc: Will Deacon Reviewed-by: Kalesh Singh Reviewed-by: Puranjay Mohan Reviewed-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20231124110511.2795958-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 39 ++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index aafc89192787..7f88028a00c0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -154,8 +154,10 @@ kunwind_next(struct kunwind_state *state) return kunwind_recover_return_address(state); } +typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie); + static __always_inline void -do_kunwind(struct kunwind_state *state, stack_trace_consume_fn consume_entry, +do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state, void *cookie) { if (kunwind_recover_return_address(state)) @@ -164,7 +166,7 @@ do_kunwind(struct kunwind_state *state, stack_trace_consume_fn consume_entry, while (1) { int ret; - if (!consume_entry(cookie, state->common.pc)) + if (!consume_state(state, cookie)) break; ret = kunwind_next(state); if (ret < 0) @@ -201,9 +203,10 @@ do_kunwind(struct kunwind_state *state, stack_trace_consume_fn consume_entry, : stackinfo_get_unknown(); \ }) -noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, - void *cookie, struct task_struct *task, - struct pt_regs *regs) +static __always_inline void +kunwind_stack_walk(kunwind_consume_fn consume_state, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stack_info stacks[] = { stackinfo_get_task(task), @@ -236,7 +239,31 @@ noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, kunwind_init_from_task(&state, task); } - do_kunwind(&state, consume_entry, cookie); + do_kunwind(&state, consume_state, cookie); +} + +struct kunwind_consume_entry_data { + stack_trace_consume_fn consume_entry; + void *cookie; +}; + +static bool +arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie) +{ + struct kunwind_consume_entry_data *data = cookie; + return data->consume_entry(data->cookie, state->common.pc); +} + +noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) +{ + struct kunwind_consume_entry_data data = { + .consume_entry = consume_entry, + .cookie = cookie, + }; + + kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs); } static bool dump_backtrace_entry(void *arg, unsigned long where) From 33c1a7785a41216ec44d0fadc1890103e2db88b0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 24 Nov 2023 15:22:21 +0000 Subject: [PATCH 181/310] kselftest/arm64: Improve output for skipped TPIDR2 ABI test When TPIDR2 is not supported the tpidr2 ABI test prints the same message for each skipped test: ok 1 skipped, TPIDR2 not supported which isn't ideal for test automation software since it tracks kselftest results based on the string used to describe the test. This is also not standard KTAP output, the expected format is: ok 1 # SKIP default_value Updated the program to generate this, using the same set of test names that we would run if the test actually executed. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231124-kselftest-arm64-tpidr2-skip-v1-1-e05d0ccef101@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/abi/tpidr2.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/tpidr2.c b/tools/testing/selftests/arm64/abi/tpidr2.c index 351a098b503a..02ee3a91b780 100644 --- a/tools/testing/selftests/arm64/abi/tpidr2.c +++ b/tools/testing/selftests/arm64/abi/tpidr2.c @@ -254,6 +254,12 @@ static int write_clone_read(void) putnum(++tests_run); \ putstr(" " #name "\n"); +#define skip_test(name) \ + tests_skipped++; \ + putstr("ok "); \ + putnum(++tests_run); \ + putstr(" # SKIP " #name "\n"); + int main(int argc, char **argv) { int ret, i; @@ -283,13 +289,11 @@ int main(int argc, char **argv) } else { putstr("# SME support not present\n"); - for (i = 0; i < EXPECTED_TESTS; i++) { - putstr("ok "); - putnum(i); - putstr(" skipped, TPIDR2 not supported\n"); - } - - tests_skipped += EXPECTED_TESTS; + skip_test(default_value); + skip_test(write_read); + skip_test(write_sleep_read); + skip_test(write_fork_read); + skip_test(write_clone_read); } print_summary(); From 48f7ab21f731f5a02ec34a4b83ae88c4a41d6a10 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 5 Dec 2023 14:24:44 +0000 Subject: [PATCH 182/310] kselftest/arm64: Log SVCR when the SME tests barf On failure we log the actual and expected value of the register we detect a mismatch in. For SME one obvious potential source of corruption would be if we had corrupted SVCR since changes in streaming mode will reset the register values, log the value to aid in understanding issues. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231205-arm64-kselftest-log-svcr-v1-1-b77abd9ee7f3@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/sve-test.S | 10 ++++++++++ tools/testing/selftests/arm64/fp/za-test.S | 6 ++++++ tools/testing/selftests/arm64/fp/zt-test.S | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index 547d077e3517..fff60e2a25ad 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -515,6 +515,10 @@ function barf mov x11, x1 // actual data mov x12, x2 // data size +#ifdef SSVE + mrs x13, S3_3_C4_C2_2 +#endif + puts "Mismatch: PID=" mov x0, x20 bl putdec @@ -534,6 +538,12 @@ function barf bl dumphex puts "]\n" +#ifdef SSVE + puts "\tSVCR: " + mov x0, x13 + bl putdecn +#endif + mov x8, #__NR_getpid svc #0 // fpsimd.c acitivty log dump hack diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 9dcd70911397..095b45531640 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -333,6 +333,9 @@ function barf // mov w8, #__NR_exit // svc #0 // end hack + + mrs x13, S3_3_C4_C2_2 + smstop mov x10, x0 // expected data mov x11, x1 // actual data @@ -356,6 +359,9 @@ function barf mov x1, x12 bl dumphex puts "]\n" + puts "\tSVCR: " + mov x0, x13 + bl putdecn mov x8, #__NR_getpid svc #0 diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index d63286397638..b5c81e81a379 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -267,6 +267,8 @@ function barf // mov w8, #__NR_exit // svc #0 // end hack + + mrs x13, S3_3_C4_C2_2 smstop mov x10, x0 // expected data mov x11, x1 // actual data @@ -287,6 +289,9 @@ function barf mov x1, x12 bl dumphex puts "]\n" + puts "\tSVCR: " + mov x0, x13 + bl putdecn mov x8, #__NR_getpid svc #0 From 86d1921c9d5a25ff057284f1208e731145e24508 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Wed, 6 Dec 2023 00:01:40 +0800 Subject: [PATCH 183/310] arm64: Delete the zero_za macro zero_za was introduced in commit ca8a4ebcff44 ("arm64/sme: Manually encode SME instructions") but doesn't appear to have any in kernel user. Drop it. Signed-off-by: Zenghui Yu Acked-by: Mark Brown Link: https://lore.kernel.org/r/20231205160140.1438-1-yuzenghui@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimdmacros.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index cdf6a35e3994..cda81d009c9b 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -242,14 +242,6 @@ | (\nx << 5) .endm -/* - * Zero the entire ZA array - * ZERO ZA - */ -.macro zero_za - .inst 0xc00800ff -.endm - .macro __for from:req, to:req .if (\from) == (\to) _for__body %\from From 256f442895ed9846bddf020d95c112de830c336c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Sat, 9 Dec 2023 01:02:47 +0000 Subject: [PATCH 184/310] arm64/sysreg: Update HFGITR_EL2 definiton to DDI0601 2023-09 The 2023-09 release of the architecture XML (DDI0601) adds a new field ATS1E1A to HFGITR_EL2, update our definition of the register to match. This was extracted from Faud Tabba's patch "KVM: arm64: Add latest HFGITR_EL2 FGT entries to nested virt" [Extracted the sysreg definition from Faud's original patch and reword subject to match -- broonie] Signed-off-by: Fuad Tabba Message-Id: <20231206100503.564090-4-tabba@google.com> Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-1-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 96cbeeab4eec..8faeab1ee024 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2102,7 +2102,9 @@ Fields HFGxTR_EL2 EndSysreg Sysreg HFGITR_EL2 3 4 1 1 6 -Res0 63:61 +Res0 63 +Field 62 ATS1E1A +Res0 61 Field 60 COSPRCTX Field 59 nGCSEPP Field 58 nGCSSTR_EL1 From 41bb68fbd016f0735798348dee2034f35cc06a17 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Sat, 9 Dec 2023 01:02:48 +0000 Subject: [PATCH 185/310] arm64/sysreg: Add definition for HAFGRTR_EL2 Add a definition of HAFGRTR_EL2 (fine grained trap control for the AMU) as per DDI0601 2023-09. This was extracted from Fuad Tabba's patch "KVM: arm64: Handle HAFGRTR_EL2 trapping in nested virt". Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20231206100503.564090-6-tabba@google.com [Extract sysreg update and rewrite commit message -- broonie] Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-2-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8faeab1ee024..145b33f75a96 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2297,6 +2297,49 @@ Field 1 DBGBVRn_EL1 Field 0 DBGBCRn_EL1 EndSysreg +Sysreg HAFGRTR_EL2 3 4 3 1 6 +Res0 63:50 +Field 49 AMEVTYPER115_EL0 +Field 48 AMEVCNTR115_EL0 +Field 47 AMEVTYPER114_EL0 +Field 46 AMEVCNTR114_EL0 +Field 45 AMEVTYPER113_EL0 +Field 44 AMEVCNTR113_EL0 +Field 43 AMEVTYPER112_EL0 +Field 42 AMEVCNTR112_EL0 +Field 41 AMEVTYPER111_EL0 +Field 40 AMEVCNTR111_EL0 +Field 39 AMEVTYPER110_EL0 +Field 38 AMEVCNTR110_EL0 +Field 37 AMEVTYPER19_EL0 +Field 36 AMEVCNTR19_EL0 +Field 35 AMEVTYPER18_EL0 +Field 34 AMEVCNTR18_EL0 +Field 33 AMEVTYPER17_EL0 +Field 32 AMEVCNTR17_EL0 +Field 31 AMEVTYPER16_EL0 +Field 30 AMEVCNTR16_EL0 +Field 29 AMEVTYPER15_EL0 +Field 28 AMEVCNTR15_EL0 +Field 27 AMEVTYPER14_EL0 +Field 26 AMEVCNTR14_EL0 +Field 25 AMEVTYPER13_EL0 +Field 24 AMEVCNTR13_EL0 +Field 23 AMEVTYPER12_EL0 +Field 22 AMEVCNTR12_EL0 +Field 21 AMEVTYPER11_EL0 +Field 20 AMEVCNTR11_EL0 +Field 19 AMEVTYPER10_EL0 +Field 18 AMEVCNTR10_EL0 +Field 17 AMCNTEN1 +Res0 16:5 +Field 4 AMEVCNTR03_EL0 +Field 3 AMEVCNTR02_EL0 +Field 2 AMEVCNTR01_EL0 +Field 1 AMEVCNTR00_EL0 +Field 0 AMCNTEN0 +EndSysreg + Sysreg ZCR_EL2 3 4 1 2 0 Fields ZCR_ELx EndSysreg From c0c5a8ea96b877e903b40ed2345e73f83b0ed612 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Sat, 9 Dec 2023 01:02:49 +0000 Subject: [PATCH 186/310] arm64/sysreg: add system register POR_EL{0,1} Add POR_EL{0,1} according to DDI0601 2023-03. Signed-off-by: Joey Gouly Reviewed-by: Mark Brown Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-3-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 13 +++++++++++++ arch/arm64/tools/sysreg | 12 ++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 5e65f51c10d2..9c2caf0efdc7 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1039,6 +1039,19 @@ #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +/* + * Permission Overlay Extension (POE) permission encodings. + */ +#define POE_NONE UL(0x0) +#define POE_R UL(0x1) +#define POE_X UL(0x2) +#define POE_RX UL(0x3) +#define POE_W UL(0x4) +#define POE_RW UL(0x5) +#define POE_XW UL(0x6) +#define POE_RXW UL(0x7) +#define POE_MASK UL(0xf) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 145b33f75a96..1d371a24da6e 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2555,6 +2555,18 @@ Sysreg PIR_EL2 3 4 10 2 3 Fields PIRx_ELx EndSysreg +Sysreg POR_EL0 3 3 10 2 4 +Fields PIRx_ELx +EndSysreg + +Sysreg POR_EL1 3 0 10 2 4 +Fields PIRx_ELx +EndSysreg + +Sysreg POR_EL12 3 5 10 2 4 +Fields PIRx_ELx +EndSysreg + Sysreg LORSA_EL1 3 0 10 4 0 Res0 63:52 Field 51:16 SA From 35768b23d830302c9b818a213a9c1e5efb618218 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Sat, 9 Dec 2023 01:02:50 +0000 Subject: [PATCH 187/310] arm64/sysreg: update CPACR_EL1 register Add E0POE bit that traps accesses to POR_EL0 from EL0. Updated according to DDI0601 2023-03. Signed-off-by: Joey Gouly Reviewed-by: Mark Brown Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-4-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 1d371a24da6e..c2dbbaa22620 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1747,7 +1747,8 @@ Field 0 M EndSysreg SysregFields CPACR_ELx -Res0 63:29 +Res0 63:30 +Field 29 E0POE Field 28 TTA Res0 27:26 Field 25:24 SMEN From 9fb5dc53a1176402905a7dde6cd812bc01ce6831 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:51 +0000 Subject: [PATCH 188/310] arm64/sysreg: Add definition for ID_AA64PFR2_EL1 DDI0601 2023-09 defines a new system register ID_AA64PFR2_EL1 which enumerates FPMR and some new MTE features. Add a definition of this register. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-5-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index c2dbbaa22620..c48a3b8d00ad 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1002,6 +1002,27 @@ UnsignedEnum 3:0 BT EndEnum EndSysreg +Sysreg ID_AA64PFR2_EL1 3 0 0 4 2 +Res0 63:36 +UnsignedEnum 35:32 FPMR + 0b0000 NI + 0b0001 IMP +EndEnum +Res0 31:12 +UnsignedEnum 11:8 MTEFAR + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 7:4 MTESTOREONLY + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 3:0 MTEPERM + 0b0000 NI + 0b0001 IMP +EndEnum +EndSysreg + Sysreg ID_AA64ZFR0_EL1 3 0 0 4 4 Res0 63:60 UnsignedEnum 59:56 F64MM From 6e3dcfd139755d95f2c0d1f865f2e093d2b35c91 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:52 +0000 Subject: [PATCH 189/310] arm64/sysreg: Update ID_AA64ISAR2_EL1 defintion for DDI0601 2023-09 DDI0601 2023-09 defines some new fields in previously RES0 space in ID_AA64ISAR2_EL1, together with one new enum value. Update the system register definition to reflect this. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-6-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index c48a3b8d00ad..7af081c52ce7 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1365,7 +1365,14 @@ EndEnum EndSysreg Sysreg ID_AA64ISAR2_EL1 3 0 0 6 2 -Res0 63:56 +UnsignedEnum 63:60 ATS1A + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 59:56 LUT + 0b0000 NI + 0b0001 IMP +EndEnum UnsignedEnum 55:52 CSSC 0b0000 NI 0b0001 IMP @@ -1374,7 +1381,19 @@ UnsignedEnum 51:48 RPRFM 0b0000 NI 0b0001 IMP EndEnum -Res0 47:32 +Res0 47:44 +UnsignedEnum 43:40 PRFMSLC + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 39:36 SYSINSTR_128 + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 35:32 SYSREG_128 + 0b0000 NI + 0b0001 IMP +EndEnum UnsignedEnum 31:28 CLRBHB 0b0000 NI 0b0001 IMP @@ -1398,6 +1417,7 @@ UnsignedEnum 15:12 APA3 0b0011 PAuth2 0b0100 FPAC 0b0101 FPACCOMBINE + 0b0110 PAuth_LR EndEnum UnsignedEnum 11:8 GPA3 0b0000 NI From b5aefb668701c2b019011f6f8fe29814b8529ecd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:53 +0000 Subject: [PATCH 190/310] arm64/sysreg: Add definition for ID_AA64ISAR3_EL1 DDI0601 2023-09 adds a new system register ID_AA64ISAR3_EL1 enumerating new floating point and TLB invalidation features. Add a defintion for it. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-7-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7af081c52ce7..7dc7c76ee4ce 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1433,6 +1433,23 @@ UnsignedEnum 3:0 WFxT EndEnum EndSysreg +Sysreg ID_AA64ISAR3_EL1 3 0 0 6 3 +Res0 63:12 +UnsignedEnum 11:8 TLBIW + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 7:4 FAMINMAX + 0b0000 NI + 0b0001 IMP +EndEnum +UnsignedEnum 3:0 CPA + 0b0000 NI + 0b0001 IMP + 0b0010 CPA2 +EndEnum +EndSysreg + Sysreg ID_AA64MMFR0_EL1 3 0 0 7 0 UnsignedEnum 63:60 ECV 0b0000 NI From 9e4f409b07df14443ae4840f17f07e5025435e3d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:54 +0000 Subject: [PATCH 191/310] arm64/sysreg: Add definition for ID_AA64FPFR0_EL1 DDI0601 2023-09 defines a new feature register ID_AA64FPFR0_EL1 which enumerates a number of FP8 related features. Add a definition for it. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-8-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 7dc7c76ee4ce..167906adae40 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1131,6 +1131,35 @@ EndEnum Res0 31:0 EndSysreg +Sysreg ID_AA64FPFR0_EL1 3 0 0 4 7 +Res0 63:32 +UnsignedEnum 31 F8CVT + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 30 F8FMA + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 29 F8DP4 + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 28 F8DP2 + 0b0 NI + 0b1 IMP +EndEnum +Res0 27:2 +UnsignedEnum 1 F8E4M3 + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 0 F8E5M2 + 0b0 NI + 0b1 IMP +EndEnum +EndSysreg + Sysreg ID_AA64DFR0_EL1 3 0 0 5 0 Enum 63:60 HPMN0 0b0000 UNPREDICTABLE From 8afe582d77000ad244b66ed278aedc4ab5ee1634 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:55 +0000 Subject: [PATCH 192/310] arm64/sysreg: Update ID_AA64SMFR0_EL1 definition for DDI0601 2023-09 The 2023-09 release of DDI0601 defines a number of new feature enumeration fields in ID_AA64SMFR0_EL1. Add these fields. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-9-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 167906adae40..191b59db79d8 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1079,7 +1079,11 @@ UnsignedEnum 63 FA64 0b0 NI 0b1 IMP EndEnum -Res0 62:60 +Res0 62:61 +UnsignedEnum 60 LUTv2 + 0b0 NI + 0b1 IMP +EndEnum UnsignedEnum 59:56 SMEver 0b0000 SME 0b0001 SME2 @@ -1107,7 +1111,14 @@ UnsignedEnum 42 F16F16 0b0 NI 0b1 IMP EndEnum -Res0 41:40 +UnsignedEnum 41 F8F16 + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 40 F8F32 + 0b0 NI + 0b1 IMP +EndEnum UnsignedEnum 39:36 I8I32 0b0000 NI 0b1111 IMP @@ -1128,7 +1139,20 @@ UnsignedEnum 32 F32F32 0b0 NI 0b1 IMP EndEnum -Res0 31:0 +Res0 31 +UnsignedEnum 30 SF8FMA + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 29 SF8DP4 + 0b0 NI + 0b1 IMP +EndEnum +UnsignedEnum 28 SF8DP2 + 0b0 NI + 0b1 IMP +EndEnum +Res0 27:0 EndSysreg Sysreg ID_AA64FPFR0_EL1 3 0 0 4 7 From a6052284a9f9bcfb982edfe00044ecdfdf72eaa7 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:56 +0000 Subject: [PATCH 193/310] arm64/sysreg: Update SCTLR_EL1 for DDI0601 2023-09 DDI0601 2023-09 defines some new fields in SCTLR_EL1 controlling new MTE and floating point features. Update our sysreg definition to reflect these. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-10-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 191b59db79d8..48df1ffafe21 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1791,7 +1791,8 @@ Field 63 TIDCP Field 62 SPINTMASK Field 61 NMI Field 60 EnTP2 -Res0 59:58 +Field 59 TCSO +Field 58 TCSO0 Field 57 EPAN Field 56 EnALS Field 55 EnAS0 @@ -1820,7 +1821,7 @@ EndEnum Field 37 ITFSB Field 36 BT1 Field 35 BT0 -Res0 34 +Field 34 EnFPM Field 33 MSCEn Field 32 CMOW Field 31 EnIA From 126cb3a60d35cc2ce7db090b087e00ff85b12cfc Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:57 +0000 Subject: [PATCH 194/310] arm64/sysreg: Update HCRX_EL2 definition for DDI0601 2023-09 DDI0601 2023-09 defines new fields in HCRX_EL2 controlling access to new system registers, update our definition of HCRX_EL2 to reflect this. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-11-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 48df1ffafe21..9b405c999adf 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2458,7 +2458,9 @@ Fields ZCR_ELx EndSysreg Sysreg HCRX_EL2 3 4 1 2 2 -Res0 63:23 +Res0 63:25 +Field 24 PACMEn +Field 23 EnFPM Field 22 GCSEn Field 21 EnIDCP128 Field 20 EnSDERR From e3a649ecf8b9253cb1d05ceb085544472b06446f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:58 +0000 Subject: [PATCH 195/310] arm64/sysreg: Add definition for FPMR DDI0601 2023-09 defines a new sysrem register FPMR (Floating Point Mode Register) which configures the new FP8 features. Add a definition of this register. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-12-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 9b405c999adf..2698dcd49765 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2139,6 +2139,29 @@ Field 1 ZA Field 0 SM EndSysreg +Sysreg FPMR 3 3 4 4 2 +Res0 63:38 +Field 37:32 LSCALE2 +Field 31:24 NSCALE +Res0 23 +Field 22:16 LSCALE +Field 15 OSC +Field 14 OSM +Res0 13:9 +UnsignedEnum 8:6 F8D + 0b000 E5M2 + 0b001 E4M3 +EndEnum +UnsignedEnum 5:3 F8S2 + 0b000 E5M2 + 0b001 E4M3 +EndEnum +UnsignedEnum 2:0 F8S1 + 0b000 E5M2 + 0b001 E4M3 +EndEnum +EndSysreg + SysregFields HFGxTR_EL2 Field 63 nAMAIR2_EL1 Field 62 nMAIR2_EL1 From e94e06d8a7960fd840ea92021ca1bf1362ea67f8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 9 Dec 2023 01:02:59 +0000 Subject: [PATCH 196/310] arm64/sysreg: Add new system registers for GCS FEAT_GCS introduces a number of new system registers. Add the registers available up to EL2 to sysreg as per DDI0601 2022-12. Signed-off-by: Mark Brown Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20231209-b4-arm64-sysreg-additions-v1-13-45284e538474@kernel.org Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 55 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 2698dcd49765..2c4b6665c5bf 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1903,6 +1903,41 @@ Sysreg SMCR_EL1 3 0 1 2 6 Fields SMCR_ELx EndSysreg +SysregFields GCSCR_ELx +Res0 63:10 +Field 9 STREn +Field 8 PUSHMEn +Res0 7 +Field 6 EXLOCKEN +Field 5 RVCHKEN +Res0 4:1 +Field 0 PCRSEL +EndSysregFields + +Sysreg GCSCR_EL1 3 0 2 5 0 +Fields GCSCR_ELx +EndSysreg + +SysregFields GCSPR_ELx +Field 63:3 PTR +Res0 2:0 +EndSysregFields + +Sysreg GCSPR_EL1 3 0 2 5 1 +Fields GCSPR_ELx +EndSysreg + +Sysreg GCSCRE0_EL1 3 0 2 5 2 +Res0 63:11 +Field 10 nTR +Field 9 STREn +Field 8 PUSHMEn +Res0 7:6 +Field 5 RVCHKEN +Res0 4:1 +Field 0 PCRSEL +EndSysreg + Sysreg ALLINT 3 0 4 3 0 Res0 63:14 Field 13 ALLINT @@ -2133,6 +2168,10 @@ Field 4 DZP Field 3:0 BS EndSysreg +Sysreg GCSPR_EL0 3 3 2 5 1 +Fields GCSPR_ELx +EndSysreg + Sysreg SVCR 3 3 4 2 2 Res0 63:2 Field 1 ZA @@ -2531,6 +2570,14 @@ Sysreg SMCR_EL2 3 4 1 2 6 Fields SMCR_ELx EndSysreg +Sysreg GCSCR_EL2 3 4 2 5 0 +Fields GCSCR_ELx +EndSysreg + +Sysreg GCSPR_EL2 3 4 2 5 1 +Fields GCSPR_ELx +EndSysreg + Sysreg DACR32_EL2 3 4 3 0 0 Res0 63:32 Field 31:30 D15 @@ -2590,6 +2637,14 @@ Sysreg SMCR_EL12 3 5 1 2 6 Fields SMCR_ELx EndSysreg +Sysreg GCSCR_EL12 3 5 2 5 0 +Fields GCSCR_ELx +EndSysreg + +Sysreg GCSPR_EL12 3 5 2 5 1 +Fields GCSPR_ELx +EndSysreg + Sysreg FAR_EL12 3 5 6 0 0 Field 63:0 ADDR EndSysreg From 79c03ed4b896513d226abdd83214b8436efdc22c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Dec 2023 18:48:18 +0100 Subject: [PATCH 197/310] drivers/perf: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/85b0b73a1b2f743dd5db15d4765c7685100de27f.1702230488.git.christophe.jaillet@wanadoo.fr Signed-off-by: Will Deacon --- drivers/perf/fsl_imx9_ddr_perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 5cf770a1bc31..9685645bfe04 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -617,7 +617,7 @@ static int ddr_perf_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pmu); - pmu->id = ida_simple_get(&ddr_ida, 0, 0, GFP_KERNEL); + pmu->id = ida_alloc(&ddr_ida, GFP_KERNEL); name = devm_kasprintf(&pdev->dev, GFP_KERNEL, DDR_PERF_DEV_NAME "%d", pmu->id); if (!name) { ret = -ENOMEM; @@ -674,7 +674,7 @@ cpuhp_instance_err: cpuhp_remove_multi_state(pmu->cpuhp_state); cpuhp_state_err: format_string_err: - ida_simple_remove(&ddr_ida, pmu->id); + ida_free(&ddr_ida, pmu->id); dev_warn(&pdev->dev, "i.MX9 DDR Perf PMU failed (%d), disabled\n", ret); return ret; } @@ -688,7 +688,7 @@ static int ddr_perf_remove(struct platform_device *pdev) perf_pmu_unregister(&pmu->pmu); - ida_simple_remove(&ddr_ida, pmu->id); + ida_free(&ddr_ida, pmu->id); return 0; } From 5ca8ab55084de7b92a5979a2d9fa233158cb1ac2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Dec 2023 09:26:38 +0000 Subject: [PATCH 198/310] drivers/perf: arm_dsu_pmu: Remove kerneldoc-style comment syntax For some reason, the Arm DSU PMU driver uses kerneldoc-style comment syntax (i.e. /** ) for non-kerneldoc comments. This makes the robots very angry indeed, so just revert these to normal comments to stop the noise. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312092000.8ltwotjt-lkp@intel.com/ Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 8223c49bd082..7ec4498e312f 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -371,7 +371,7 @@ static inline u32 dsu_pmu_get_reset_overflow(void) return __dsu_pmu_get_reset_overflow(); } -/** +/* * dsu_pmu_set_event_period: Set the period for the counter. * * All DSU PMU event counters, except the cycle counter are 32bit @@ -602,7 +602,7 @@ static struct dsu_pmu *dsu_pmu_alloc(struct platform_device *pdev) return dsu_pmu; } -/** +/* * dsu_pmu_dt_get_cpus: Get the list of CPUs in the cluster * from device tree. */ @@ -632,7 +632,7 @@ static int dsu_pmu_dt_get_cpus(struct device *dev, cpumask_t *mask) return 0; } -/** +/* * dsu_pmu_acpi_get_cpus: Get the list of CPUs in the cluster * from ACPI. */ From 9343c790e6de7edd2bab17572832f4f21c748dc2 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:13 +0000 Subject: [PATCH 199/310] arm: perf: Remove inlines from arm_pmuv3.c These are all static and in one compilation unit so the inline has no effect on the binary. Except if FTRACE is enabled, then 3 functions which were already not inlined now get the nops added which allows them to be traced. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-2-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 46 ++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index bbde60f1b08c..09478e2b825e 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -304,12 +304,12 @@ PMU_FORMAT_ATTR(rdpmc, "config1:1"); static int sysctl_perf_user_access __read_mostly; -static inline bool armv8pmu_event_is_64bit(struct perf_event *event) +static bool armv8pmu_event_is_64bit(struct perf_event *event) { return event->attr.config1 & 0x1; } -static inline bool armv8pmu_event_want_user_access(struct perf_event *event) +static bool armv8pmu_event_want_user_access(struct perf_event *event) { return event->attr.config1 & 0x2; } @@ -401,7 +401,7 @@ static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu) return (IS_ENABLED(CONFIG_ARM64) && is_pmuv3p5(cpu_pmu->pmuver)); } -static inline bool armv8pmu_event_has_user_read(struct perf_event *event) +static bool armv8pmu_event_has_user_read(struct perf_event *event) { return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT; } @@ -411,7 +411,7 @@ static inline bool armv8pmu_event_has_user_read(struct perf_event *event) * except when we have allocated the 64bit cycle counter (for CPU * cycles event) or when user space counter access is enabled. */ -static inline bool armv8pmu_event_is_chained(struct perf_event *event) +static bool armv8pmu_event_is_chained(struct perf_event *event) { int idx = event->hw.idx; struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); @@ -432,36 +432,36 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) #define ARMV8_IDX_TO_COUNTER(x) \ (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) -static inline u64 armv8pmu_pmcr_read(void) +static u64 armv8pmu_pmcr_read(void) { return read_pmcr(); } -static inline void armv8pmu_pmcr_write(u64 val) +static void armv8pmu_pmcr_write(u64 val) { val &= ARMV8_PMU_PMCR_MASK; isb(); write_pmcr(val); } -static inline int armv8pmu_has_overflowed(u32 pmovsr) +static int armv8pmu_has_overflowed(u32 pmovsr) { return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; } -static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) +static int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) { return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline u64 armv8pmu_read_evcntr(int idx) +static u64 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); return read_pmevcntrn(counter); } -static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) +static u64 armv8pmu_read_hw_counter(struct perf_event *event) { int idx = event->hw.idx; u64 val = armv8pmu_read_evcntr(idx); @@ -523,14 +523,14 @@ static u64 armv8pmu_read_counter(struct perf_event *event) return armv8pmu_unbias_long_counter(event, value); } -static inline void armv8pmu_write_evcntr(int idx, u64 value) +static void armv8pmu_write_evcntr(int idx, u64 value) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); write_pmevcntrn(counter, value); } -static inline void armv8pmu_write_hw_counter(struct perf_event *event, +static void armv8pmu_write_hw_counter(struct perf_event *event, u64 value) { int idx = event->hw.idx; @@ -556,7 +556,7 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) armv8pmu_write_hw_counter(event, value); } -static inline void armv8pmu_write_evtype(int idx, u32 val) +static void armv8pmu_write_evtype(int idx, u32 val) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); @@ -564,7 +564,7 @@ static inline void armv8pmu_write_evtype(int idx, u32 val) write_pmevtypern(counter, val); } -static inline void armv8pmu_write_event_type(struct perf_event *event) +static void armv8pmu_write_event_type(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -598,7 +598,7 @@ static u32 armv8pmu_event_cnten_mask(struct perf_event *event) return mask; } -static inline void armv8pmu_enable_counter(u32 mask) +static void armv8pmu_enable_counter(u32 mask) { /* * Make sure event configuration register writes are visible before we @@ -608,7 +608,7 @@ static inline void armv8pmu_enable_counter(u32 mask) write_pmcntenset(mask); } -static inline void armv8pmu_enable_event_counter(struct perf_event *event) +static void armv8pmu_enable_event_counter(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; u32 mask = armv8pmu_event_cnten_mask(event); @@ -620,7 +620,7 @@ static inline void armv8pmu_enable_event_counter(struct perf_event *event) armv8pmu_enable_counter(mask); } -static inline void armv8pmu_disable_counter(u32 mask) +static void armv8pmu_disable_counter(u32 mask) { write_pmcntenclr(mask); /* @@ -630,7 +630,7 @@ static inline void armv8pmu_disable_counter(u32 mask) isb(); } -static inline void armv8pmu_disable_event_counter(struct perf_event *event) +static void armv8pmu_disable_event_counter(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; u32 mask = armv8pmu_event_cnten_mask(event); @@ -642,18 +642,18 @@ static inline void armv8pmu_disable_event_counter(struct perf_event *event) armv8pmu_disable_counter(mask); } -static inline void armv8pmu_enable_intens(u32 mask) +static void armv8pmu_enable_intens(u32 mask) { write_pmintenset(mask); } -static inline void armv8pmu_enable_event_irq(struct perf_event *event) +static void armv8pmu_enable_event_irq(struct perf_event *event) { u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); armv8pmu_enable_intens(BIT(counter)); } -static inline void armv8pmu_disable_intens(u32 mask) +static void armv8pmu_disable_intens(u32 mask) { write_pmintenclr(mask); isb(); @@ -662,13 +662,13 @@ static inline void armv8pmu_disable_intens(u32 mask) isb(); } -static inline void armv8pmu_disable_event_irq(struct perf_event *event) +static void armv8pmu_disable_event_irq(struct perf_event *event) { u32 counter = ARMV8_IDX_TO_COUNTER(event->hw.idx); armv8pmu_disable_intens(BIT(counter)); } -static inline u32 armv8pmu_getreset_flags(void) +static u32 armv8pmu_getreset_flags(void) { u32 value; From 62e1f212e5fe7624249212813ee96202e0c31430 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:14 +0000 Subject: [PATCH 200/310] arm: perf/kvm: Use GENMASK for ARMV8_PMU_PMCR_N This is so that FIELD_GET and FIELD_PREP can be used and that the fields are in a consistent format to arm64/tools/sysreg Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-3-james.clark@arm.com Signed-off-by: Will Deacon --- arch/arm64/kvm/pmu-emul.c | 8 +++----- arch/arm64/kvm/sys_regs.c | 4 ++-- drivers/perf/arm_pmuv3.c | 4 ++-- include/linux/perf/arm_pmuv3.h | 3 +-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index fe99b3dab6ce..3d9467ff73bc 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -267,9 +267,8 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) { - u64 val = kvm_vcpu_read_pmcr(vcpu) >> ARMV8_PMU_PMCR_N_SHIFT; + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); - val &= ARMV8_PMU_PMCR_N_MASK; if (val == 0) return BIT(ARMV8_PMU_CYCLE_IDX); else @@ -1136,8 +1135,7 @@ u8 kvm_arm_pmu_get_pmuver_limit(void) */ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { - u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0) & - ~(ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); - return pmcr | ((u64)vcpu->kvm->arch.pmcr_n << ARMV8_PMU_PMCR_N_SHIFT); + return u64_replace_bits(pmcr, vcpu->kvm->arch.pmcr_n, ARMV8_PMU_PMCR_N); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 4735e1b37fb3..ff45d688bd7d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -877,7 +877,7 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) u64 pmcr, val; pmcr = kvm_vcpu_read_pmcr(vcpu); - val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + val = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { kvm_inject_undefined(vcpu); return false; @@ -1143,7 +1143,7 @@ static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val) { - u8 new_n = (val >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + u8 new_n = FIELD_GET(ARMV8_PMU_PMCR_N, val); struct kvm *kvm = vcpu->kvm; mutex_lock(&kvm->arch.config_lock); diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 09478e2b825e..374e973a4e42 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1111,8 +1112,7 @@ static void __armv8pmu_probe_pmu(void *info) probe->present = true; /* Read the nb of CNTx counters supported from PMNC */ - cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) - & ARMV8_PMU_PMCR_N_MASK; + cpu_pmu->num_events = FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()); /* Add the CPU cycles counter */ cpu_pmu->num_events += 1; diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 9c226adf938a..ed62bd75cec7 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -215,8 +215,7 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ #define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ /* From 2f6a00f30600417ee2737f2b1229c75663f1e3c9 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:15 +0000 Subject: [PATCH 201/310] arm: perf: Use GENMASK for PMMIR fields This is so that FIELD_GET and FIELD_PREP can be used and that the fields are in a consistent format to arm64/tools/sysreg Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-4-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 8 +++----- include/linux/perf/arm_pmuv3.h | 9 +++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 374e973a4e42..36bc00494f56 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -332,7 +332,7 @@ static ssize_t slots_show(struct device *dev, struct device_attribute *attr, { struct pmu *pmu = dev_get_drvdata(dev); struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); - u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK; + u32 slots = FIELD_GET(ARMV8_PMU_SLOTS, cpu_pmu->reg_pmmir); return sysfs_emit(page, "0x%08x\n", slots); } @@ -344,8 +344,7 @@ static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr, { struct pmu *pmu = dev_get_drvdata(dev); struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); - u32 bus_slots = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_SLOTS_SHIFT) - & ARMV8_PMU_BUS_SLOTS_MASK; + u32 bus_slots = FIELD_GET(ARMV8_PMU_BUS_SLOTS, cpu_pmu->reg_pmmir); return sysfs_emit(page, "0x%08x\n", bus_slots); } @@ -357,8 +356,7 @@ static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr, { struct pmu *pmu = dev_get_drvdata(dev); struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); - u32 bus_width = (cpu_pmu->reg_pmmir >> ARMV8_PMU_BUS_WIDTH_SHIFT) - & ARMV8_PMU_BUS_WIDTH_MASK; + u32 bus_width = FIELD_GET(ARMV8_PMU_BUS_WIDTH, cpu_pmu->reg_pmmir); u32 val = 0; /* Encoded as Log2(number of bytes), plus one */ diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index ed62bd75cec7..1bc7678c10d4 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -250,12 +250,9 @@ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ /* PMMIR_EL1.SLOTS mask */ -#define ARMV8_PMU_SLOTS_MASK 0xff - -#define ARMV8_PMU_BUS_SLOTS_SHIFT 8 -#define ARMV8_PMU_BUS_SLOTS_MASK 0xff -#define ARMV8_PMU_BUS_WIDTH_SHIFT 16 -#define ARMV8_PMU_BUS_WIDTH_MASK 0xf +#define ARMV8_PMU_SLOTS GENMASK(7, 0) +#define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) +#define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) /* * This code is really good From d30f09b6d7de5d159dbb537f9d67dceb67409420 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:16 +0000 Subject: [PATCH 202/310] arm: perf: Convert remaining fields to use GENMASK Convert the remaining fields to use either GENMASK or be built from other fields. These all already started at bit 0 so don't need a code change for the lack of _SHIFT. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-5-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 2 +- include/linux/perf/arm_pmuv3.h | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 36bc00494f56..a93b4cf88562 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -675,7 +675,7 @@ static u32 armv8pmu_getreset_flags(void) value = read_pmovsclr(); /* Write to clear flags */ - value &= ARMV8_PMU_OVSR_MASK; + value &= ARMV8_PMU_OVERFLOWED_MASK; write_pmovsclr(value); return value; diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 1bc7678c10d4..daa63542242d 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -216,19 +216,25 @@ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ #define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ -#define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ +/* Mask for writable bits */ +#define ARMV8_PMU_PMCR_MASK (ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \ + ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \ + ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \ + ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP) /* * PMOVSR: counters overflow flag status reg */ -#define ARMV8_PMU_OVSR_MASK 0xffffffff /* Mask for writable bits */ -#define ARMV8_PMU_OVERFLOWED_MASK ARMV8_PMU_OVSR_MASK +#define ARMV8_PMU_OVSR_P GENMASK(30, 0) +#define ARMV8_PMU_OVSR_C BIT(31) +/* Mask for writable bits is both P and C fields */ +#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C) /* * PMXEVTYPER: Event selection reg */ #define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ -#define ARMV8_PMU_EVTYPE_EVENT 0xffff /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ /* * Event filters for PMUv3 @@ -243,11 +249,13 @@ /* * PMUSERENR: user enable reg */ -#define ARMV8_PMU_USERENR_MASK 0xf /* Mask for writable bits */ #define ARMV8_PMU_USERENR_EN (1 << 0) /* PMU regs can be accessed at EL0 */ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +/* Mask for writable bits */ +#define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ + ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) /* PMMIR_EL1.SLOTS mask */ #define ARMV8_PMU_SLOTS GENMASK(7, 0) From 3115ee021bfb04efde2e96507bfcc1330261a6a1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:17 +0000 Subject: [PATCH 203/310] arm64: perf: Include threshold control fields in PMEVTYPER mask FEAT_PMUv3_TH (Armv8.8) adds two new fields to PMEVTYPER, so include them in the mask. These aren't writable on 32 bit kernels as they are in the high part of the register, so only include them for arm64. It would be difficult to do this statically in the asm header files for each platform without resulting in circular includes or #ifdefs inline in the code. For that reason the ARMV8_PMU_EVTYPE_MASK definition has been removed and the mask is constructed programmatically. Reviewed-by: Suzuki K Poulose Reviewed-by: Anshuman Khandual Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-6-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 9 ++++++++- include/linux/perf/arm_pmuv3.h | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index a93b4cf88562..441bf73ee3d5 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -558,8 +558,15 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) static void armv8pmu_write_evtype(int idx, u32 val) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); + unsigned long mask = ARMV8_PMU_EVTYPE_EVENT | + ARMV8_PMU_INCLUDE_EL2 | + ARMV8_PMU_EXCLUDE_EL0 | + ARMV8_PMU_EXCLUDE_EL1; - val &= ARMV8_PMU_EVTYPE_MASK; + if (IS_ENABLED(CONFIG_ARM64)) + mask |= ARMV8_PMU_EVTYPE_TC | ARMV8_PMU_EVTYPE_TH; + + val &= mask; write_pmevtypern(counter, val); } diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index daa63542242d..91957b3468e9 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -233,8 +233,9 @@ /* * PMXEVTYPER: Event selection reg */ -#define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ #define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_TH GENMASK(43, 32) +#define ARMV8_PMU_EVTYPE_TC GENMASK(63, 61) /* * Event filters for PMUv3 From f6da86969a3c284466ab6080764b2ed91689f262 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:18 +0000 Subject: [PATCH 204/310] arm: pmu: Share user ABI format mechanism with SPE This mechanism makes it much easier to define and read new attributes so move it to the arm_pmu.h header so that it can be shared. At the same time update the existing format attributes to use it. GENMASK has to be changed to GENMASK_ULL because the config fields are 64 bits even on arm32 where this will also be used now. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-7-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 21 ++++++++++++++++----- drivers/perf/arm_spe_pmu.c | 22 ---------------------- include/linux/perf/arm_pmu.h | 22 ++++++++++++++++++++++ 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 441bf73ee3d5..8a573db81da1 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -299,20 +299,31 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = { .is_visible = armv8pmu_event_attr_is_visible, }; -PMU_FORMAT_ATTR(event, "config:0-15"); -PMU_FORMAT_ATTR(long, "config1:0"); -PMU_FORMAT_ATTR(rdpmc, "config1:1"); +/* User ABI */ +#define ATTR_CFG_FLD_event_CFG config +#define ATTR_CFG_FLD_event_LO 0 +#define ATTR_CFG_FLD_event_HI 15 +#define ATTR_CFG_FLD_long_CFG config1 +#define ATTR_CFG_FLD_long_LO 0 +#define ATTR_CFG_FLD_long_HI 0 +#define ATTR_CFG_FLD_rdpmc_CFG config1 +#define ATTR_CFG_FLD_rdpmc_LO 1 +#define ATTR_CFG_FLD_rdpmc_HI 1 + +GEN_PMU_FORMAT_ATTR(event); +GEN_PMU_FORMAT_ATTR(long); +GEN_PMU_FORMAT_ATTR(rdpmc); static int sysctl_perf_user_access __read_mostly; static bool armv8pmu_event_is_64bit(struct perf_event *event) { - return event->attr.config1 & 0x1; + return ATTR_CFG_GET_FLD(&event->attr, long); } static bool armv8pmu_event_want_user_access(struct perf_event *event) { - return event->attr.config1 & 0x2; + return ATTR_CFG_GET_FLD(&event->attr, rdpmc); } static struct attribute *armv8_pmuv3_format_attrs[] = { diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index d2b0cbf0e0c4..b622d75d8c9e 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -206,28 +206,6 @@ static const struct attribute_group arm_spe_pmu_cap_group = { #define ATTR_CFG_FLD_inv_event_filter_LO 0 #define ATTR_CFG_FLD_inv_event_filter_HI 63 -/* Why does everything I do descend into this? */ -#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ - (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi - -#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ - __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) - -#define GEN_PMU_FORMAT_ATTR(name) \ - PMU_FORMAT_ATTR(name, \ - _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ - ATTR_CFG_FLD_##name##_LO, \ - ATTR_CFG_FLD_##name##_HI)) - -#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ - ((((attr)->cfg) >> lo) & GENMASK(hi - lo, 0)) - -#define ATTR_CFG_GET_FLD(attr, name) \ - _ATTR_CFG_GET_FLD(attr, \ - ATTR_CFG_FLD_##name##_CFG, \ - ATTR_CFG_FLD_##name##_LO, \ - ATTR_CFG_FLD_##name##_HI) - GEN_PMU_FORMAT_ATTR(ts_enable); GEN_PMU_FORMAT_ATTR(pa_enable); GEN_PMU_FORMAT_ATTR(pct_enable); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index e2503d48ddee..b3b34f6670cf 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -183,4 +183,26 @@ void armpmu_free_irq(int irq, int cpu); #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" #define ARMV8_TRBE_PDEV_NAME "arm,trbe" +/* Why does everything I do descend into this? */ +#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi + +#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) + +#define GEN_PMU_FORMAT_ATTR(name) \ + PMU_FORMAT_ATTR(name, \ + _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI)) + +#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ + ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) + +#define ATTR_CFG_GET_FLD(attr, name) \ + _ATTR_CFG_GET_FLD(attr, \ + ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI) + #endif /* __ARM_PMU_H__ */ From a5f4ca68f348ac059efd6a3d7ad4040aed1c0818 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:19 +0000 Subject: [PATCH 205/310] perf/arm_dmc620: Remove duplicate format attribute #defines These were copied from the SPE driver, but now they're in the arm_pmu.h header so delete them and use the header instead. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-8-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 30cea6859574..9de9dc8db8db 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -189,27 +190,6 @@ static const struct attribute_group dmc620_pmu_events_attr_group = { #define ATTR_CFG_FLD_clkdiv2_LO 9 #define ATTR_CFG_FLD_clkdiv2_HI 9 -#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ - (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi - -#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ - __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) - -#define GEN_PMU_FORMAT_ATTR(name) \ - PMU_FORMAT_ATTR(name, \ - _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ - ATTR_CFG_FLD_##name##_LO, \ - ATTR_CFG_FLD_##name##_HI)) - -#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ - ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) - -#define ATTR_CFG_GET_FLD(attr, name) \ - _ATTR_CFG_GET_FLD(attr, \ - ATTR_CFG_FLD_##name##_CFG, \ - ATTR_CFG_FLD_##name##_LO, \ - ATTR_CFG_FLD_##name##_HI) - GEN_PMU_FORMAT_ATTR(mask); GEN_PMU_FORMAT_ATTR(match); GEN_PMU_FORMAT_ATTR(invert); From c7b98bf0fc79bd2d91f6ef84e07b5f648d43c13e Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:20 +0000 Subject: [PATCH 206/310] KVM: selftests: aarch64: Update tools copy of arm_pmuv3.h Now that ARMV8_PMU_PMCR_N is made with GENMASK, update usages to treat it as a pre-shifted mask. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-9-james.clark@arm.com Signed-off-by: Will Deacon --- tools/include/perf/arm_pmuv3.h | 43 +++++++++++-------- .../kvm/aarch64/vpmu_counter_access.c | 5 +-- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tools/include/perf/arm_pmuv3.h b/tools/include/perf/arm_pmuv3.h index e822d49fb5b8..1e397d55384e 100644 --- a/tools/include/perf/arm_pmuv3.h +++ b/tools/include/perf/arm_pmuv3.h @@ -218,45 +218,54 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f -#define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ +/* Mask for writable bits */ +#define ARMV8_PMU_PMCR_MASK (ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \ + ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \ + ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \ + ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP) /* * PMOVSR: counters overflow flag status reg */ -#define ARMV8_PMU_OVSR_MASK 0xffffffff /* Mask for writable bits */ -#define ARMV8_PMU_OVERFLOWED_MASK ARMV8_PMU_OVSR_MASK +#define ARMV8_PMU_OVSR_P GENMASK(30, 0) +#define ARMV8_PMU_OVSR_C BIT(31) +/* Mask for writable bits is both P and C fields */ +#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C) /* * PMXEVTYPER: Event selection reg */ -#define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ -#define ARMV8_PMU_EVTYPE_EVENT 0xffff /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_TH GENMASK(43, 32) +#define ARMV8_PMU_EVTYPE_TC GENMASK(63, 61) /* * Event filters for PMUv3 */ -#define ARMV8_PMU_EXCLUDE_EL1 (1U << 31) -#define ARMV8_PMU_EXCLUDE_EL0 (1U << 30) -#define ARMV8_PMU_INCLUDE_EL2 (1U << 27) +#define ARMV8_PMU_EXCLUDE_EL1 (1U << 31) +#define ARMV8_PMU_EXCLUDE_EL0 (1U << 30) +#define ARMV8_PMU_EXCLUDE_NS_EL1 (1U << 29) +#define ARMV8_PMU_EXCLUDE_NS_EL0 (1U << 28) +#define ARMV8_PMU_INCLUDE_EL2 (1U << 27) +#define ARMV8_PMU_EXCLUDE_EL3 (1U << 26) /* * PMUSERENR: user enable reg */ -#define ARMV8_PMU_USERENR_MASK 0xf /* Mask for writable bits */ #define ARMV8_PMU_USERENR_EN (1 << 0) /* PMU regs can be accessed at EL0 */ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +/* Mask for writable bits */ +#define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ + ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) /* PMMIR_EL1.SLOTS mask */ -#define ARMV8_PMU_SLOTS_MASK 0xff - -#define ARMV8_PMU_BUS_SLOTS_SHIFT 8 -#define ARMV8_PMU_BUS_SLOTS_MASK 0xff -#define ARMV8_PMU_BUS_WIDTH_SHIFT 16 -#define ARMV8_PMU_BUS_WIDTH_MASK 0xf +#define ARMV8_PMU_SLOTS GENMASK(7, 0) +#define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) +#define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) +#define ARMV8_PMU_THWIDTH GENMASK(23, 20) /* * This code is really good diff --git a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c index 5ea78986e665..9d51b5691349 100644 --- a/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/aarch64/vpmu_counter_access.c @@ -42,13 +42,12 @@ struct pmreg_sets { static uint64_t get_pmcr_n(uint64_t pmcr) { - return (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; + return FIELD_GET(ARMV8_PMU_PMCR_N, pmcr); } static void set_pmcr_n(uint64_t *pmcr, uint64_t pmcr_n) { - *pmcr = *pmcr & ~(ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); - *pmcr |= (pmcr_n << ARMV8_PMU_PMCR_N_SHIFT); + u64p_replace_bits((__u64 *) pmcr, pmcr_n, ARMV8_PMU_PMCR_N); } static uint64_t get_counters_mask(uint64_t n) From 186c91aaf54989a9c74869dcc6ba031313d8e2b8 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:21 +0000 Subject: [PATCH 207/310] arm: pmu: Move error message and -EOPNOTSUPP to individual PMUs -EPERM or -EINVAL always get converted to -EOPNOTSUPP, so replace them. This will allow __hw_perf_event_init() to return a different code or not print that particular message for a different error in the next commit. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-10-james.clark@arm.com Signed-off-by: Will Deacon --- arch/arm/kernel/perf_event_v7.c | 6 ++++-- drivers/perf/apple_m1_cpu_pmu.c | 6 ++++-- drivers/perf/arm_pmu.c | 11 +++++------ drivers/perf/arm_pmuv3.c | 6 ++++-- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index c890354b04e9..a3322e2b3ea4 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -1052,8 +1052,10 @@ static int armv7pmu_set_event_filter(struct hw_perf_event *event, { unsigned long config_base = 0; - if (attr->exclude_idle) - return -EPERM; + if (attr->exclude_idle) { + pr_debug("ARM performance counters do not support mode exclusion\n"); + return -EOPNOTSUPP; + } if (attr->exclude_user) config_base |= ARMV7_EXCLUDE_USER; if (attr->exclude_kernel) diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c index cd2de44b61b9..f322e5ca1114 100644 --- a/drivers/perf/apple_m1_cpu_pmu.c +++ b/drivers/perf/apple_m1_cpu_pmu.c @@ -524,8 +524,10 @@ static int m1_pmu_set_event_filter(struct hw_perf_event *event, { unsigned long config_base = 0; - if (!attr->exclude_guest) - return -EINVAL; + if (!attr->exclude_guest) { + pr_debug("ARM performance counters do not support mode exclusion\n"); + return -EOPNOTSUPP; + } if (!attr->exclude_kernel) config_base |= M1_PMU_CFG_COUNT_KERNEL; if (!attr->exclude_user) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 379479b50bdd..8458fe2cebb4 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -445,7 +445,7 @@ __hw_perf_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; - int mapping; + int mapping, ret; hwc->flags = 0; mapping = armpmu->map_event(event); @@ -470,11 +470,10 @@ __hw_perf_event_init(struct perf_event *event) /* * Check whether we need to exclude the counter from certain modes. */ - if (armpmu->set_event_filter && - armpmu->set_event_filter(hwc, &event->attr)) { - pr_debug("ARM performance counters do not support " - "mode exclusion\n"); - return -EOPNOTSUPP; + if (armpmu->set_event_filter) { + ret = armpmu->set_event_filter(hwc, &event->attr); + if (ret) + return ret; } /* diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 8a573db81da1..2ba215f74d92 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -936,8 +936,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, { unsigned long config_base = 0; - if (attr->exclude_idle) - return -EPERM; + if (attr->exclude_idle) { + pr_debug("ARM performance counters do not support mode exclusion\n"); + return -EOPNOTSUPP; + } /* * If we're running in hyp mode, then we *are* the hypervisor. From 816c26754447e8b28d6c604e1f5b1d205b2586ee Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:22 +0000 Subject: [PATCH 208/310] arm64: perf: Add support for event counting threshold FEAT_PMUv3_TH (Armv8.8) permits a PMU counter to increment only on events whose count meets a specified threshold condition. For example if PMEVTYPERn.TC (Threshold Control) is set to 0b101 (Greater than or equal, count), and the threshold is set to 2, then the PMU counter will now only increment by 1 when an event would have previously incremented the PMU counter by 2 or more on a single processor cycle. Three new Perf event config fields, 'threshold', 'threshold_compare' and 'threshold_count' have been added to control the feature. threshold_compare maps to the upper two bits of PMEVTYPERn.TC and threshold_count maps to the first bit of TC. These separate attributes have been picked rather than enumerating all the possible combinations of the TC field as in the Arm ARM. The attributes would be used on a Perf command line like this: $ perf stat -e stall_slot/threshold=2,threshold_compare=2/ A new capability for reading out the maximum supported threshold value has also been added: $ cat /sys/bus/event_source/devices/armv8_pmuv3/caps/threshold_max 0x000000ff If a threshold higher than threshold_max is provided, then an error is generated. If FEAT_PMUv3_TH isn't implemented or a 32 bit kernel is running, then threshold_max reads zero, and attempting to set a threshold value will also result in an error. The threshold is per PMU counter, and there are potentially different threshold_max values per PMU type on heterogeneous systems. Bits higher than 32 now need to be written into PMEVTYPER, so armv8pmu_write_evtype() has to be updated to take an unsigned long value rather than u32 which gives the correct behavior on both aarch32 and 64. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-11-james.clark@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 79 +++++++++++++++++++++++++++++++++- include/linux/perf/arm_pmuv3.h | 1 + 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 2ba215f74d92..23fa6c5da82c 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -309,10 +309,22 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = { #define ATTR_CFG_FLD_rdpmc_CFG config1 #define ATTR_CFG_FLD_rdpmc_LO 1 #define ATTR_CFG_FLD_rdpmc_HI 1 +#define ATTR_CFG_FLD_threshold_count_CFG config1 /* PMEVTYPER.TC[0] */ +#define ATTR_CFG_FLD_threshold_count_LO 2 +#define ATTR_CFG_FLD_threshold_count_HI 2 +#define ATTR_CFG_FLD_threshold_compare_CFG config1 /* PMEVTYPER.TC[2:1] */ +#define ATTR_CFG_FLD_threshold_compare_LO 3 +#define ATTR_CFG_FLD_threshold_compare_HI 4 +#define ATTR_CFG_FLD_threshold_CFG config1 /* PMEVTYPER.TH */ +#define ATTR_CFG_FLD_threshold_LO 5 +#define ATTR_CFG_FLD_threshold_HI 16 GEN_PMU_FORMAT_ATTR(event); GEN_PMU_FORMAT_ATTR(long); GEN_PMU_FORMAT_ATTR(rdpmc); +GEN_PMU_FORMAT_ATTR(threshold_count); +GEN_PMU_FORMAT_ATTR(threshold_compare); +GEN_PMU_FORMAT_ATTR(threshold); static int sysctl_perf_user_access __read_mostly; @@ -326,10 +338,27 @@ static bool armv8pmu_event_want_user_access(struct perf_event *event) return ATTR_CFG_GET_FLD(&event->attr, rdpmc); } +static u8 armv8pmu_event_threshold_control(struct perf_event_attr *attr) +{ + u8 th_compare = ATTR_CFG_GET_FLD(attr, threshold_compare); + u8 th_count = ATTR_CFG_GET_FLD(attr, threshold_count); + + /* + * The count bit is always the bottom bit of the full control field, and + * the comparison is the upper two bits, but it's not explicitly + * labelled in the Arm ARM. For the Perf interface we split it into two + * fields, so reconstruct it here. + */ + return (th_compare << 1) | th_count; +} + static struct attribute *armv8_pmuv3_format_attrs[] = { &format_attr_event.attr, &format_attr_long.attr, &format_attr_rdpmc.attr, + &format_attr_threshold.attr, + &format_attr_threshold_compare.attr, + &format_attr_threshold_count.attr, NULL, }; @@ -379,10 +408,38 @@ static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(bus_width); +static u32 threshold_max(struct arm_pmu *cpu_pmu) +{ + /* + * PMMIR.THWIDTH is readable and non-zero on aarch32, but it would be + * impossible to write the threshold in the upper 32 bits of PMEVTYPER. + */ + if (IS_ENABLED(CONFIG_ARM)) + return 0; + + /* + * The largest value that can be written to PMEVTYPER_EL0.TH is + * (2 ^ PMMIR.THWIDTH) - 1. + */ + return (1 << FIELD_GET(ARMV8_PMU_THWIDTH, cpu_pmu->reg_pmmir)) - 1; +} + +static ssize_t threshold_max_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + + return sysfs_emit(page, "0x%08x\n", threshold_max(cpu_pmu)); +} + +static DEVICE_ATTR_RO(threshold_max); + static struct attribute *armv8_pmuv3_caps_attrs[] = { &dev_attr_slots.attr, &dev_attr_bus_slots.attr, &dev_attr_bus_width.attr, + &dev_attr_threshold_max.attr, NULL, }; @@ -566,7 +623,7 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) armv8pmu_write_hw_counter(event, value); } -static void armv8pmu_write_evtype(int idx, u32 val) +static void armv8pmu_write_evtype(int idx, unsigned long val) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); unsigned long mask = ARMV8_PMU_EVTYPE_EVENT | @@ -935,6 +992,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, struct perf_event_attr *attr) { unsigned long config_base = 0; + struct perf_event *perf_event = container_of(attr, struct perf_event, + attr); + struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu); + u32 th; if (attr->exclude_idle) { pr_debug("ARM performance counters do not support mode exclusion\n"); @@ -968,6 +1029,22 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, if (attr->exclude_user) config_base |= ARMV8_PMU_EXCLUDE_EL0; + /* + * If FEAT_PMUv3_TH isn't implemented, then THWIDTH (threshold_max) will + * be 0 and will also trigger this check, preventing it from being used. + */ + th = ATTR_CFG_GET_FLD(attr, threshold); + if (th > threshold_max(cpu_pmu)) { + pr_debug("PMU event threshold exceeds max value\n"); + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_ARM64) && th) { + config_base |= FIELD_PREP(ARMV8_PMU_EVTYPE_TH, th); + config_base |= FIELD_PREP(ARMV8_PMU_EVTYPE_TC, + armv8pmu_event_threshold_control(attr)); + } + /* * Install the filter into config_base as this is used to * construct the event type. diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 91957b3468e9..0f4d62ef3a9a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -262,6 +262,7 @@ #define ARMV8_PMU_SLOTS GENMASK(7, 0) #define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) #define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) +#define ARMV8_PMU_THWIDTH GENMASK(23, 20) /* * This code is really good From bd690638e2c27dcea1d56376aa4bf3995d82ccfc Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:23 +0000 Subject: [PATCH 209/310] Documentation: arm64: Document the PMU event counting threshold feature Add documentation for the new Perf event open parameters and the threshold_max capability file. Reviewed-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Acked-by: Namhyung Kim Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-12-james.clark@arm.com Signed-off-by: Will Deacon --- Documentation/arch/arm64/perf.rst | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/Documentation/arch/arm64/perf.rst b/Documentation/arch/arm64/perf.rst index 1f87b57c2332..997fd716b82f 100644 --- a/Documentation/arch/arm64/perf.rst +++ b/Documentation/arch/arm64/perf.rst @@ -164,3 +164,75 @@ and should be used to mask the upper bits as needed. https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/arm64/tests/user-events.c .. _tools/lib/perf/tests/test-evsel.c: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/perf/tests/test-evsel.c + +Event Counting Threshold +========================================== + +Overview +-------- + +FEAT_PMUv3_TH (Armv8.8) permits a PMU counter to increment only on +events whose count meets a specified threshold condition. For example if +threshold_compare is set to 2 ('Greater than or equal'), and the +threshold is set to 2, then the PMU counter will now only increment by +when an event would have previously incremented the PMU counter by 2 or +more on a single processor cycle. + +To increment by 1 after passing the threshold condition instead of the +number of events on that cycle, add the 'threshold_count' option to the +commandline. + +How-to +------ + +These are the parameters for controlling the feature: + +.. list-table:: + :header-rows: 1 + + * - Parameter + - Description + * - threshold + - Value to threshold the event by. A value of 0 means that + thresholding is disabled and the other parameters have no effect. + * - threshold_compare + - | Comparison function to use, with the following values supported: + | + | 0: Not-equal + | 1: Equals + | 2: Greater-than-or-equal + | 3: Less-than + * - threshold_count + - If this is set, count by 1 after passing the threshold condition + instead of the value of the event on this cycle. + +The threshold, threshold_compare and threshold_count values can be +provided per event, for example: + +.. code-block:: sh + + perf stat -e stall_slot/threshold=2,threshold_compare=2/ \ + -e dtlb_walk/threshold=10,threshold_compare=3,threshold_count/ + +In this example the stall_slot event will count by 2 or more on every +cycle where 2 or more stalls happen. And dtlb_walk will count by 1 on +every cycle where the number of dtlb walks were less than 10. + +The maximum supported threshold value can be read from the caps of each +PMU, for example: + +.. code-block:: sh + + cat /sys/bus/event_source/devices/armv8_pmuv3/caps/threshold_max + + 0x000000ff + +If a value higher than this is given, then opening the event will result +in an error. The highest possible maximum is 4095, as the config field +for threshold is limited to 12 bits, and the Perf tool will refuse to +parse higher values. + +If the PMU doesn't support FEAT_PMUv3_TH, then threshold_max will read +0, and attempting to set a threshold value will also result in an error. +threshold_max will also read as 0 on aarch32 guests, even if the host +is running on hardware with the feature. From 232afb557835d6f6859c73bf610bad308c96b131 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 2 Dec 2023 12:50:23 +0100 Subject: [PATCH 210/310] x86/CPU/AMD: Add X86_FEATURE_ZEN1 Add a synthetic feature flag specifically for first generation Zen machines. There's need to have a generic flag for all Zen generations so make X86_FEATURE_ZEN be that flag. Fixes: 30fa92832f40 ("x86/CPU/AMD: Add ZenX generations flags") Suggested-by: Brian Gerst Suggested-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/dc3835e3-0731-4230-bbb9-336bbe3d042b@amd.com --- arch/x86/include/asm/cpufeatures.h | 3 ++- arch/x86/kernel/cpu/amd.c | 11 ++++++----- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 149cc5d5c2ae..632c26cdeeda 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -315,6 +315,7 @@ #define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ #define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ #define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 89bbb1ac9ecf..339586394e7e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -542,7 +542,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x00 ... 0x2f: case 0x50 ... 0x5f: - setup_force_cpu_cap(X86_FEATURE_ZEN); + setup_force_cpu_cap(X86_FEATURE_ZEN1); break; case 0x30 ... 0x4f: case 0x60 ... 0x7f: @@ -948,12 +948,13 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) static void init_amd_zen_common(void) { + setup_force_cpu_cap(X86_FEATURE_ZEN); #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif } -static void init_amd_zen(struct cpuinfo_x86 *c) +static void init_amd_zen1(struct cpuinfo_x86 *c) { init_amd_zen_common(); fix_erratum_1386(c); @@ -1075,8 +1076,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x16: init_amd_jg(c); break; } - if (boot_cpu_has(X86_FEATURE_ZEN)) - init_amd_zen(c); + if (boot_cpu_has(X86_FEATURE_ZEN1)) + init_amd_zen1(c); else if (boot_cpu_has(X86_FEATURE_ZEN2)) init_amd_zen2(c); else if (boot_cpu_has(X86_FEATURE_ZEN3)) @@ -1143,7 +1144,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN) && c->x86_model > 0x2f)) + (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 798e60b5454b..845a4023ba44 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ From 3dfdc2750c6cdc6a5ebf5effb07f92db761de35d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:15:57 +0100 Subject: [PATCH 211/310] arm64: kernel: Disable latent_entropy GCC plugin in early C runtime In subsequent patches, mark portions of the early C code will be marked as __init. Unfortunarely, __init implies __latent_entropy, and this would result in the early C code being instrumented in an unsafe manner. Disable the latent entropy plugin for the early C code. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-44-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/pi/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile index 4c0ea3cd4ea4..c844a0546d7f 100644 --- a/arch/arm64/kernel/pi/Makefile +++ b/arch/arm64/kernel/pi/Makefile @@ -3,6 +3,7 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ $(call cc-option,-mbranch-protection=none) \ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ -include $(srctree)/include/linux/hidden.h \ From a22fc8e102dc475e91dc13e6e1e395f4d95ae684 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:15:58 +0100 Subject: [PATCH 212/310] arm64: mm: Take potential load offset into account when KASLR is off We enable CONFIG_RELOCATABLE even when CONFIG_RANDOMIZE_BASE is disabled, and this permits the loader (i.e., EFI) to place the kernel anywhere in physical memory as long as the base address is 64k aligned. This means that the 'KASLR' case described in the header that defines the size of the statically allocated page tables could take effect even when CONFIG_RANDMIZE_BASE=n. So check for CONFIG_RELOCATABLE instead. Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20231129111555.3594833-45-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 27 ++++++------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 85d26143faa5..83ddb14b95a5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -37,27 +37,12 @@ /* - * If KASLR is enabled, then an offset K is added to the kernel address - * space. The bottom 21 bits of this offset are zero to guarantee 2MB - * alignment for PA and VA. - * - * For each pagetable level of the swapper, we know that the shift will - * be larger than 21 (for the 4KB granule case we use section maps thus - * the smallest shift is actually 30) thus there is the possibility that - * KASLR can increase the number of pagetable entries by 1, so we make - * room for this extra entry. - * - * Note KASLR cannot increase the number of required entries for a level - * by more than one because it increments both the virtual start and end - * addresses equally (the extra entry comes from the case where the end - * address is just pushed over a boundary and the start address isn't). + * A relocatable kernel may execute from an address that differs from the one at + * which it was linked. In the worst case, its runtime placement may intersect + * with two adjacent PGDIR entries, which means that an additional page table + * may be needed at each subordinate level. */ - -#ifdef CONFIG_RANDOMIZE_BASE -#define EARLY_KASLR (1) -#else -#define EARLY_KASLR (0) -#endif +#define EXTRA_PAGE __is_defined(CONFIG_RELOCATABLE) #define SPAN_NR_ENTRIES(vstart, vend, shift) \ ((((vend) - 1) >> (shift)) - ((vstart) >> (shift)) + 1) @@ -83,7 +68,7 @@ + EARLY_PGDS((vstart), (vend), add) /* each PGDIR needs a next level page table */ \ + EARLY_PUDS((vstart), (vend), add) /* each PUD needs a next level page table */ \ + EARLY_PMDS((vstart), (vend), add)) /* each PMD needs a next level page table */ -#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EARLY_KASLR)) +#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end, EXTRA_PAGE)) /* the initial ID map may need two extra pages if it needs to be extended */ #if VA_BITS < 48 From 376f5a3bd7e21337dc41f7abb56c2c74ac63038a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:15:59 +0100 Subject: [PATCH 213/310] arm64: mm: get rid of kimage_vaddr global variable We store the address of _text in kimage_vaddr, but since commit 09e3c22a86f6889d ("arm64: Use a variable to store non-global mappings decision"), we no longer reference this variable from modules so we no longer need to export it. In fact, we don't need it at all so let's just get rid of it. Acked-by: Mark Rutland Signed-off-by: Ard Biesheuvel Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20231129111555.3594833-46-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 6 ++---- arch/arm64/kernel/head.S | 2 +- arch/arm64/mm/mmu.c | 3 --- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index fde4186cc387..b8d726f951ae 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -182,6 +182,7 @@ #include #include #include +#include #if VA_BITS > 48 extern u64 vabits_actual; @@ -193,15 +194,12 @@ extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) -/* the virtual base of the kernel image */ -extern u64 kimage_vaddr; - /* the offset between the kernel virtual and physical mappings */ extern u64 kimage_voffset; static inline unsigned long kaslr_offset(void) { - return kimage_vaddr - KIMAGE_VADDR; + return (u64)&_text - KIMAGE_VADDR; } #ifdef CONFIG_RANDOMIZE_BASE diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 7b236994f0e1..cab7f91949d8 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -482,7 +482,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) str_l x21, __fdt_pointer, x5 // Save FDT pointer - ldr_l x4, kimage_vaddr // Save the offset between + adrp x4, _text // Save the offset between sub x4, x4, x0 // the kernel virtual and str_l x4, kimage_voffset, x5 // physical mappings diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 15f6347d23b6..03c73e9197ac 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -52,9 +52,6 @@ u64 vabits_actual __ro_after_init = VA_BITS_MIN; EXPORT_SYMBOL(vabits_actual); #endif -u64 kimage_vaddr __ro_after_init = (u64)&_text; -EXPORT_SYMBOL(kimage_vaddr); - u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); From cbc59c9a4e5785796ccac9a975a94cb52c87feb1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:10 +0100 Subject: [PATCH 214/310] arm64: idreg-override: Omit non-NULL checks for override pointer Now that override pointers are always set, we can drop the various non-NULL checks that we have in the code. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-57-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 3addc09f8746..536bc33859bc 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -216,9 +216,6 @@ static void __init match_options(const char *cmdline) for (i = 0; i < ARRAY_SIZE(regs); i++) { int f; - if (!regs[i]->override) - continue; - for (f = 0; strlen(regs[i]->fields[f].name); f++) { u64 shift = regs[i]->fields[f].shift; u64 width = regs[i]->fields[f].width ?: 4; @@ -319,10 +316,8 @@ asmlinkage void __init init_feature_override(u64 boot_status) int i; for (i = 0; i < ARRAY_SIZE(regs); i++) { - if (regs[i]->override) { - regs[i]->override->val = 0; - regs[i]->override->mask = 0; - } + regs[i]->override->val = 0; + regs[i]->override->mask = 0; } __boot_status = boot_status; @@ -330,9 +325,8 @@ asmlinkage void __init init_feature_override(u64 boot_status) parse_cmdline(); for (i = 0; i < ARRAY_SIZE(regs); i++) { - if (regs[i]->override) - dcache_clean_inval_poc((unsigned long)regs[i]->override, - (unsigned long)regs[i]->override + - sizeof(*regs[i]->override)); + dcache_clean_inval_poc((unsigned long)regs[i]->override, + (unsigned long)regs[i]->override + + sizeof(*regs[i]->override)); } } From 01fd29092a35833ef87bd13c0a025e726550d646 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:11 +0100 Subject: [PATCH 215/310] arm64: idreg-override: Prepare for place relative reloc patching The ID reg override handling code uses a rather elaborate data structure that relies on statically initialized absolute address values in pointer fields. This means that this code cannot run until relocation fixups have been applied, and this is unfortunate, because it means we cannot discover overrides for KASLR or LVA/LPA without creating the kernel mapping and performing the relocations first. This can be solved by switching to place-relative relocations, which can be applied by the linker at build time. This means some additional arithmetic is required when dereferencing these pointers, as we can no longer dereference the pointer members directly. So let's implement this for idreg-override.c in a preliminary way, i.e., convert all the references in code to use a special accessor that produces the correct absolute value at runtime. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-58-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 89 +++++++++++++++++++----------- 1 file changed, 56 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 536bc33859bc..ca1b8d2dbe99 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -21,14 +21,25 @@ static u64 __boot_status __initdata; +// temporary __prel64 related definitions +// to be removed when this code is moved under pi/ + +#define __prel64_initconst __initconst + +#define PREL64(type, name) union { type *name; } + +#define prel64_pointer(__d) (__d) + +typedef bool filter_t(u64 val); + struct ftr_set_desc { char name[FTR_DESC_NAME_LEN]; - struct arm64_ftr_override *override; + PREL64(struct arm64_ftr_override, override); struct { char name[FTR_DESC_FIELD_LEN]; u8 shift; u8 width; - bool (*filter)(u64 val); + PREL64(filter_t, filter); } fields[]; }; @@ -46,7 +57,7 @@ static bool __init mmfr1_vh_filter(u64 val) val == 0); } -static const struct ftr_set_desc mmfr1 __initconst = { +static const struct ftr_set_desc mmfr1 __prel64_initconst = { .name = "id_aa64mmfr1", .override = &id_aa64mmfr1_override, .fields = { @@ -70,7 +81,7 @@ static bool __init pfr0_sve_filter(u64 val) return true; } -static const struct ftr_set_desc pfr0 __initconst = { +static const struct ftr_set_desc pfr0 __prel64_initconst = { .name = "id_aa64pfr0", .override = &id_aa64pfr0_override, .fields = { @@ -94,7 +105,7 @@ static bool __init pfr1_sme_filter(u64 val) return true; } -static const struct ftr_set_desc pfr1 __initconst = { +static const struct ftr_set_desc pfr1 __prel64_initconst = { .name = "id_aa64pfr1", .override = &id_aa64pfr1_override, .fields = { @@ -105,7 +116,7 @@ static const struct ftr_set_desc pfr1 __initconst = { }, }; -static const struct ftr_set_desc isar1 __initconst = { +static const struct ftr_set_desc isar1 __prel64_initconst = { .name = "id_aa64isar1", .override = &id_aa64isar1_override, .fields = { @@ -117,7 +128,7 @@ static const struct ftr_set_desc isar1 __initconst = { }, }; -static const struct ftr_set_desc isar2 __initconst = { +static const struct ftr_set_desc isar2 __prel64_initconst = { .name = "id_aa64isar2", .override = &id_aa64isar2_override, .fields = { @@ -128,7 +139,7 @@ static const struct ftr_set_desc isar2 __initconst = { }, }; -static const struct ftr_set_desc smfr0 __initconst = { +static const struct ftr_set_desc smfr0 __prel64_initconst = { .name = "id_aa64smfr0", .override = &id_aa64smfr0_override, .fields = { @@ -149,7 +160,7 @@ static bool __init hvhe_filter(u64 val) ID_AA64MMFR1_EL1_VH_SHIFT)); } -static const struct ftr_set_desc sw_features __initconst = { +static const struct ftr_set_desc sw_features __prel64_initconst = { .name = "arm64_sw", .override = &arm64_sw_feature_override, .fields = { @@ -159,14 +170,15 @@ static const struct ftr_set_desc sw_features __initconst = { }, }; -static const struct ftr_set_desc * const regs[] __initconst = { - &mmfr1, - &pfr0, - &pfr1, - &isar1, - &isar2, - &smfr0, - &sw_features, +static const +PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr1 }, + { &pfr0 }, + { &pfr1 }, + { &isar1 }, + { &isar2 }, + { &smfr0 }, + { &sw_features }, }; static const struct { @@ -214,15 +226,20 @@ static void __init match_options(const char *cmdline) int i; for (i = 0; i < ARRAY_SIZE(regs); i++) { + const struct ftr_set_desc *reg = prel64_pointer(regs[i].reg); + struct arm64_ftr_override *override; int f; - for (f = 0; strlen(regs[i]->fields[f].name); f++) { - u64 shift = regs[i]->fields[f].shift; - u64 width = regs[i]->fields[f].width ?: 4; + override = prel64_pointer(reg->override); + + for (f = 0; strlen(reg->fields[f].name); f++) { + u64 shift = reg->fields[f].shift; + u64 width = reg->fields[f].width ?: 4; u64 mask = GENMASK_ULL(shift + width - 1, shift); + bool (*filter)(u64 val); u64 v; - if (find_field(cmdline, regs[i], f, &v)) + if (find_field(cmdline, reg, f, &v)) continue; /* @@ -230,16 +247,16 @@ static void __init match_options(const char *cmdline) * it by setting the value to the all-ones while * clearing the mask... Yes, this is fragile. */ - if (regs[i]->fields[f].filter && - !regs[i]->fields[f].filter(v)) { - regs[i]->override->val |= mask; - regs[i]->override->mask &= ~mask; + filter = prel64_pointer(reg->fields[f].filter); + if (filter && !filter(v)) { + override->val |= mask; + override->mask &= ~mask; continue; } - regs[i]->override->val &= ~mask; - regs[i]->override->val |= (v << shift) & mask; - regs[i]->override->mask |= mask; + override->val &= ~mask; + override->val |= (v << shift) & mask; + override->mask |= mask; return; } @@ -313,11 +330,16 @@ void init_feature_override(u64 boot_status); asmlinkage void __init init_feature_override(u64 boot_status) { + struct arm64_ftr_override *override; + const struct ftr_set_desc *reg; int i; for (i = 0; i < ARRAY_SIZE(regs); i++) { - regs[i]->override->val = 0; - regs[i]->override->mask = 0; + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + + override->val = 0; + override->mask = 0; } __boot_status = boot_status; @@ -325,8 +347,9 @@ asmlinkage void __init init_feature_override(u64 boot_status) parse_cmdline(); for (i = 0; i < ARRAY_SIZE(regs); i++) { - dcache_clean_inval_poc((unsigned long)regs[i]->override, - (unsigned long)regs[i]->override + - sizeof(*regs[i]->override)); + reg = prel64_pointer(regs[i].reg); + override = prel64_pointer(reg->override); + dcache_clean_inval_poc((unsigned long)override, + (unsigned long)(override + 1)); } } From dc3f5aae06381b43bc9d0d416bd15ee1682940e9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:12 +0100 Subject: [PATCH 216/310] arm64: idreg-override: Avoid parameq() and parameqn() The only way parameq() and parameqn() deviate from the ordinary string and memory routines is that they ignore the difference between dashes and underscores. Since we copy each command line argument into a buffer before passing it to parameq() and parameqn() numerous times, let's just convert all dashes to underscores just once, and update the alias array accordingly. This also helps reduce the dependency on kernel APIs that are no longer available once we move this code into the early mini C runtime. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-59-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index ca1b8d2dbe99..1eca93446345 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -185,8 +185,8 @@ static const struct { char alias[FTR_ALIAS_NAME_LEN]; char feature[FTR_ALIAS_OPTION_LEN]; } aliases[] __initconst = { - { "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, - { "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=nvhe", "id_aa64mmfr1.vh=0" }, + { "kvm_arm.mode=protected", "id_aa64mmfr1.vh=0" }, { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, @@ -215,7 +215,7 @@ static int __init find_field(const char *cmdline, len = snprintf(opt, ARRAY_SIZE(opt), "%s.%s=", reg->name, reg->fields[f].name); - if (!parameqn(cmdline, opt, len)) + if (memcmp(cmdline, opt, len)) return -1; return kstrtou64(cmdline + len, 0, v); @@ -272,23 +272,29 @@ static __init void __parse_cmdline(const char *cmdline, bool parse_aliases) cmdline = skip_spaces(cmdline); - for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++); + /* terminate on "--" appearing on the command line by itself */ + if (cmdline[0] == '-' && cmdline[1] == '-' && isspace(cmdline[2])) + return; + + for (len = 0; cmdline[len] && !isspace(cmdline[len]); len++) { + if (len >= sizeof(buf) - 1) + break; + if (cmdline[len] == '-') + buf[len] = '_'; + else + buf[len] = cmdline[len]; + } if (!len) return; - len = min(len, ARRAY_SIZE(buf) - 1); - memcpy(buf, cmdline, len); - buf[len] = '\0'; - - if (strcmp(buf, "--") == 0) - return; + buf[len] = 0; cmdline += len; match_options(buf); for (i = 0; parse_aliases && i < ARRAY_SIZE(aliases); i++) - if (parameq(buf, aliases[i].alias)) + if (!memcmp(buf, aliases[i].alias, len + 1)) __parse_cmdline(aliases[i].feature, false); } while (1); } From bcf1eed3f8a0b83824b41da82d226cf36320be76 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:13 +0100 Subject: [PATCH 217/310] arm64: idreg-override: avoid strlen() to check for empty strings strlen() is a costly way to decide whether a string is empty, as in that case, the first character will be NUL so we can check for that directly. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-60-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 1eca93446345..8b22ca523186 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -232,7 +232,7 @@ static void __init match_options(const char *cmdline) override = prel64_pointer(reg->override); - for (f = 0; strlen(reg->fields[f].name); f++) { + for (f = 0; reg->fields[f].name[0] != '\0'; f++) { u64 shift = reg->fields[f].shift; u64 width = reg->fields[f].width ?: 4; u64 mask = GENMASK_ULL(shift + width - 1, shift); From 060260a6be47ae163b0c71a6d0902d065b68f3d2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:14 +0100 Subject: [PATCH 218/310] arm64: idreg-override: Avoid sprintf() for simple string concatenation Instead of using sprintf() with the "%s.%s=" format, where the first string argument is always the same in the inner loop of match_options(), use simple memcpy() for string concatenation, and move the first copy to the outer loop. This removes the dependency on sprintf(), which will be difficult to fulfil when we move this code into the early mini C runtime. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-61-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 8b22ca523186..cf1df5f6fbc1 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -206,14 +206,15 @@ static int __init parse_nokaslr(char *unused) } early_param("nokaslr", parse_nokaslr); -static int __init find_field(const char *cmdline, +static int __init find_field(const char *cmdline, char *opt, int len, const struct ftr_set_desc *reg, int f, u64 *v) { - char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; - int len; + int flen = strlen(reg->fields[f].name); - len = snprintf(opt, ARRAY_SIZE(opt), "%s.%s=", - reg->name, reg->fields[f].name); + // append '=' to obtain '.=' + memcpy(opt + len, reg->fields[f].name, flen); + len += flen; + opt[len++] = '='; if (memcmp(cmdline, opt, len)) return -1; @@ -223,15 +224,21 @@ static int __init find_field(const char *cmdline, static void __init match_options(const char *cmdline) { + char opt[FTR_DESC_NAME_LEN + FTR_DESC_FIELD_LEN + 2]; int i; for (i = 0; i < ARRAY_SIZE(regs); i++) { const struct ftr_set_desc *reg = prel64_pointer(regs[i].reg); struct arm64_ftr_override *override; + int len = strlen(reg->name); int f; override = prel64_pointer(reg->override); + // set opt[] to '.' + memcpy(opt, reg->name, len); + opt[len++] = '.'; + for (f = 0; reg->fields[f].name[0] != '\0'; f++) { u64 shift = reg->fields[f].shift; u64 width = reg->fields[f].width ?: 4; @@ -239,7 +246,7 @@ static void __init match_options(const char *cmdline) bool (*filter)(u64 val); u64 v; - if (find_field(cmdline, reg, f, &v)) + if (find_field(cmdline, opt, len, reg, f, &v)) continue; /* From ea48626f8f0efc697555d17bb5853df92461e280 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:15 +0100 Subject: [PATCH 219/310] arm64: idreg-override: Avoid kstrtou64() to parse a single hex digit All ID register value overrides are =0 with the exception of the nokaslr pseudo feature which uses =1. In order to remove the dependency on kstrtou64(), which is part of the core kernel and no longer usable once we move idreg-override into the early mini C runtime, let's just parse a single hex digit (with optional leading 0x) and set the output value accordingly. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-62-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index cf1df5f6fbc1..9646f94094ed 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -206,6 +206,20 @@ static int __init parse_nokaslr(char *unused) } early_param("nokaslr", parse_nokaslr); +static int __init parse_hexdigit(const char *p, u64 *v) +{ + // skip "0x" if it comes next + if (p[0] == '0' && tolower(p[1]) == 'x') + p += 2; + + // check whether the RHS is a single hex digit + if (!isxdigit(p[0]) || (p[1] && !isspace(p[1]))) + return -EINVAL; + + *v = tolower(*p) - (isdigit(*p) ? '0' : 'a' - 10); + return 0; +} + static int __init find_field(const char *cmdline, char *opt, int len, const struct ftr_set_desc *reg, int f, u64 *v) { @@ -219,7 +233,7 @@ static int __init find_field(const char *cmdline, char *opt, int len, if (memcmp(cmdline, opt, len)) return -1; - return kstrtou64(cmdline + len, 0, v); + return parse_hexdigit(cmdline + len, v); } static void __init match_options(const char *cmdline) From 50f176175e96ea5d7cbd8536c0dd774de796ef63 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 29 Nov 2023 12:16:16 +0100 Subject: [PATCH 220/310] arm64/kernel: Move 'nokaslr' parsing out of early idreg code Parsing and ignoring 'nokaslr' can be done from anywhere, except from the code that runs very early and is therefore built with limitations on the kind of relocations it is permitted to use. So move it to a source file that is part of the ordinary kernel build. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231129111555.3594833-63-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/idreg-override.c | 7 ------- arch/arm64/kernel/kaslr.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 9646f94094ed..e30fd9e32ef3 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -199,13 +199,6 @@ static const struct { { "nokaslr", "arm64_sw.nokaslr=1" }, }; -static int __init parse_nokaslr(char *unused) -{ - /* nokaslr param handling is done by early cpufeature code */ - return 0; -} -early_param("nokaslr", parse_nokaslr); - static int __init parse_hexdigit(const char *p, u64 *v) { // skip "0x" if it comes next diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 94a269cd1f07..12c7f3c8ba76 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -36,3 +36,10 @@ void __init kaslr_init(void) pr_info("KASLR enabled\n"); __kaslr_is_enabled = true; } + +static int __init parse_nokaslr(char *unused) +{ + /* nokaslr param handling is done by early cpufeature code */ + return 0; +} +early_param("nokaslr", parse_nokaslr); From 7a0a6d55ed93fe064039c4e014d5cf3a97391bbb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Nov 2023 15:02:04 +0200 Subject: [PATCH 221/310] x86/docs: Remove reference to syscall trampoline in PTI Commit bf904d2762ee ("x86/pti/64: Remove the SYSCALL64 entry trampoline") removed the syscall trampoline and instead opted to enable using the default SYSCALL64 entry point by mapping the percpu TSS. Unfortunately, the PTI documentation wasn't updated when the respective changes were made, so bring the doc up to speed. Signed-off-by: Nikolay Borisov Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231102130204.41043-1-nik.borisov@suse.com --- Documentation/arch/x86/pti.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst index 4b858a9bad8d..e08d35177bc0 100644 --- a/Documentation/arch/x86/pti.rst +++ b/Documentation/arch/x86/pti.rst @@ -81,11 +81,9 @@ this protection comes at a cost: and exit (it can be skipped when the kernel is interrupted, though.) Moves to CR3 are on the order of a hundred cycles, and are required at every entry and exit. - b. A "trampoline" must be used for SYSCALL entry. This - trampoline depends on a smaller set of resources than the - non-PTI SYSCALL entry code, so requires mapping fewer - things into the userspace page tables. The downside is - that stacks must be switched at entry time. + b. Percpu TSS is mapped into the user page tables to allow SYSCALL64 path + to work under PTI. This doesn't have a direct runtime cost but it can + be argued it opens certain timing attack scenarios. c. Global pages are disabled for all kernel structures not mapped into both kernel and userspace page tables. This feature of the MMU allows different processes to share TLB @@ -167,7 +165,7 @@ that are worth noting here. * Failures of the selftests/x86 code. Usually a bug in one of the more obscure corners of entry_64.S * Crashes in early boot, especially around CPU bringup. Bugs - in the trampoline code or mappings cause these. + in the mappings cause these. * Crashes at the first interrupt. Caused by bugs in entry_64.S, like screwing up a page table switch. Also caused by incorrectly mapping the IRQ handler entry code. From 9b19700e623f96222c69ecb2adecb1a3e3664cc0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 12:32:20 +0100 Subject: [PATCH 222/310] arm64: fpsimd: Drop unneeded 'busy' flag Kernel mode NEON will preserve the user mode FPSIMD state by saving it into the task struct before clobbering the registers. In order to avoid the need for preserving kernel mode state too, we disallow nested use of kernel mode NEON, i..e, use in softirq context while the interrupted task context was using kernel mode NEON too. Originally, this policy was implemented using a per-CPU flag which was exposed via may_use_simd(), requiring the users of the kernel mode NEON to deal with the possibility that it might return false, and having NEON and non-NEON code paths. This policy was changed by commit 13150149aa6ded1 ("arm64: fpsimd: run kernel mode NEON with softirqs disabled"), and now, softirq processing is disabled entirely instead, and so may_use_simd() can never fail when called from task or softirq context. This means we can drop the fpsimd_context_busy flag entirely, and instead, ensure that we disable softirq processing in places where we formerly relied on the flag for preventing races in the FPSIMD preserve routines. Signed-off-by: Ard Biesheuvel Reviewed-by: Mark Brown Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231208113218.3001940-7-ardb@google.com [will: Folded in fix from CAMj1kXFhzbJRyWHELCivQW1yJaF=p07LLtbuyXYX3G1WtsdyQg@mail.gmail.com] Signed-off-by: Will Deacon --- arch/arm64/include/asm/simd.h | 11 +------ arch/arm64/kernel/fpsimd.c | 55 +++++++++-------------------------- 2 files changed, 15 insertions(+), 51 deletions(-) diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h index 6a75d7ecdcaa..8e86c9e70e48 100644 --- a/arch/arm64/include/asm/simd.h +++ b/arch/arm64/include/asm/simd.h @@ -12,8 +12,6 @@ #include #include -DECLARE_PER_CPU(bool, fpsimd_context_busy); - #ifdef CONFIG_KERNEL_MODE_NEON /* @@ -28,17 +26,10 @@ static __must_check inline bool may_use_simd(void) /* * We must make sure that the SVE has been initialized properly * before using the SIMD in kernel. - * fpsimd_context_busy is only set while preemption is disabled, - * and is clear whenever preemption is enabled. Since - * this_cpu_read() is atomic w.r.t. preemption, fpsimd_context_busy - * cannot change under our feet -- if it's set we cannot be - * migrated, and if it's clear we cannot be migrated to a CPU - * where it is set. */ return !WARN_ON(!system_capabilities_finalized()) && system_supports_fpsimd() && - !in_hardirq() && !irqs_disabled() && !in_nmi() && - !this_cpu_read(fpsimd_context_busy); + !in_hardirq() && !irqs_disabled() && !in_nmi(); } #else /* ! CONFIG_KERNEL_MODE_NEON */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 1559c706d32d..37ee261b117d 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -85,13 +85,13 @@ * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and * flag the register state as invalid. * - * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may - * save the task's FPSIMD context back to task_struct from softirq context. - * To prevent this from racing with the manipulation of the task's FPSIMD state - * from task context and thereby corrupting the state, it is necessary to - * protect any manipulation of a task's fpsimd_state or TIF_FOREIGN_FPSTATE - * flag with {, __}get_cpu_fpsimd_context(). This will still allow softirqs to - * run but prevent them to use FPSIMD. + * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be + * called from softirq context, which will save the task's FPSIMD context back + * to task_struct. To prevent this from racing with the manipulation of the + * task's FPSIMD state from task context and thereby corrupting the state, it + * is necessary to protect any manipulation of a task's fpsimd_state or + * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend + * softirq servicing entirely until put_cpu_fpsimd_context() is called. * * For a certain task, the sequence may look something like this: * - the task gets scheduled in; if both the task's fpsimd_cpu field @@ -209,27 +209,14 @@ static inline void sme_free(struct task_struct *t) { } #endif -DEFINE_PER_CPU(bool, fpsimd_context_busy); -EXPORT_PER_CPU_SYMBOL(fpsimd_context_busy); - static void fpsimd_bind_task_to_cpu(void); -static void __get_cpu_fpsimd_context(void) -{ - bool busy = __this_cpu_xchg(fpsimd_context_busy, true); - - WARN_ON(busy); -} - /* * Claim ownership of the CPU FPSIMD context for use by the calling context. * * The caller may freely manipulate the FPSIMD context metadata until * put_cpu_fpsimd_context() is called. * - * The double-underscore version must only be called if you know the task - * can't be preempted. - * * On RT kernels local_bh_disable() is not sufficient because it only * serializes soft interrupt related sections via a local lock, but stays * preemptible. Disabling preemption is the right choice here as bottom @@ -242,14 +229,6 @@ static void get_cpu_fpsimd_context(void) local_bh_disable(); else preempt_disable(); - __get_cpu_fpsimd_context(); -} - -static void __put_cpu_fpsimd_context(void) -{ - bool busy = __this_cpu_xchg(fpsimd_context_busy, false); - - WARN_ON(!busy); /* No matching get_cpu_fpsimd_context()? */ } /* @@ -261,18 +240,12 @@ static void __put_cpu_fpsimd_context(void) */ static void put_cpu_fpsimd_context(void) { - __put_cpu_fpsimd_context(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_enable(); else preempt_enable(); } -static bool have_cpu_fpsimd_context(void) -{ - return !preemptible() && __this_cpu_read(fpsimd_context_busy); -} - unsigned int task_get_vl(const struct task_struct *task, enum vec_type type) { return task->thread.vl[type]; @@ -383,7 +356,7 @@ static void task_fpsimd_load(void) bool restore_ffr; WARN_ON(!system_supports_fpsimd()); - WARN_ON(!have_cpu_fpsimd_context()); + WARN_ON(preemptible()); if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { @@ -467,7 +440,7 @@ static void fpsimd_save(void) unsigned int vl; WARN_ON(!system_supports_fpsimd()); - WARN_ON(!have_cpu_fpsimd_context()); + WARN_ON(preemptible()); if (test_thread_flag(TIF_FOREIGN_FPSTATE)) return; @@ -1507,7 +1480,7 @@ void fpsimd_thread_switch(struct task_struct *next) if (!system_supports_fpsimd()) return; - __get_cpu_fpsimd_context(); + WARN_ON_ONCE(!irqs_disabled()); /* Save unsaved fpsimd state, if any: */ fpsimd_save(); @@ -1523,8 +1496,6 @@ void fpsimd_thread_switch(struct task_struct *next) update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, wrong_task || wrong_cpu); - - __put_cpu_fpsimd_context(); } static void fpsimd_flush_thread_vl(enum vec_type type) @@ -1826,13 +1797,15 @@ static void fpsimd_flush_cpu_state(void) */ void fpsimd_save_and_flush_cpu_state(void) { + unsigned long flags; + if (!system_supports_fpsimd()) return; WARN_ON(preemptible()); - __get_cpu_fpsimd_context(); + local_irq_save(flags); fpsimd_save(); fpsimd_flush_cpu_state(); - __put_cpu_fpsimd_context(); + local_irq_restore(flags); } #ifdef CONFIG_KERNEL_MODE_NEON From aefbab8e77eb16b56e18f24b85a09ebf4dc60e93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 12:32:21 +0100 Subject: [PATCH 223/310] arm64: fpsimd: Preserve/restore kernel mode NEON at context switch Currently, the FPSIMD register file is not preserved and restored along with the general registers on exception entry/exit or context switch. For this reason, we disable preemption when enabling FPSIMD for kernel mode use in task context, and suspend the processing of softirqs so that there are no concurrent uses in the kernel. (Kernel mode FPSIMD may not be used at all in other contexts). Disabling preemption while doing CPU intensive work on inputs of potentially unbounded size is bad for real-time performance, which is why we try and ensure that SIMD crypto code does not operate on more than ~4k at a time, which is an arbitrary limit and requires assembler code to implement efficiently. We can avoid the need for disabling preemption if we can ensure that any in-kernel users of the NEON will not lose the FPSIMD register state across a context switch. And given that disabling softirqs implicitly disables preemption as well, we will also have to ensure that a softirq that runs code using FPSIMD can safely interrupt an in-kernel user. So introduce a thread_info flag TIF_KERNEL_FPSTATE, and modify the context switch hook for FPSIMD to preserve and restore the kernel mode FPSIMD to/from struct thread_struct when it is set. This avoids any scheduling blackouts due to prolonged use of FPSIMD in kernel mode, without the need for manual yielding. In order to support softirq processing while FPSIMD is being used in kernel task context, use the same flag to decide whether the kernel mode FPSIMD state needs to be preserved and restored before allowing FPSIMD to be used in softirq context. Signed-off-by: Ard Biesheuvel Reviewed-by: Mark Brown Reviewed-by: Mark Rutland Link: https://lore.kernel.org/r/20231208113218.3001940-8-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 2 + arch/arm64/include/asm/thread_info.h | 1 + arch/arm64/kernel/fpsimd.c | 92 ++++++++++++++++++++++------ 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index e5bc54522e71..ce6eebd6c08b 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -167,6 +167,8 @@ struct thread_struct { unsigned long fault_address; /* fault info */ unsigned long fault_code; /* ESR_EL1 value */ struct debug_info debug; /* debugging */ + + struct user_fpsimd_state kernel_fpsimd_state; #ifdef CONFIG_ARM64_PTR_AUTH struct ptrauth_keys_user keys_user; #ifdef CONFIG_ARM64_PTR_AUTH_KERNEL diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 553d1bc559c6..e72a3bf9e563 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -80,6 +80,7 @@ void arch_setup_new_exec(void); #define TIF_TAGGED_ADDR 26 /* Allow tagged user addresses */ #define TIF_SME 27 /* SME in use */ #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ +#define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 37ee261b117d..e4595a8d731f 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -357,6 +357,7 @@ static void task_fpsimd_load(void) WARN_ON(!system_supports_fpsimd()); WARN_ON(preemptible()); + WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { @@ -379,7 +380,7 @@ static void task_fpsimd_load(void) default: /* * This indicates either a bug in - * fpsimd_save() or memory corruption, we + * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the * memory allocated for FPSMID registers so @@ -430,7 +431,7 @@ static void task_fpsimd_load(void) * than via current, if we are saving KVM state then it will have * ensured that the type of registers to save is set in last->to_save. */ -static void fpsimd_save(void) +static void fpsimd_save_user_state(void) { struct cpu_fp_state const *last = this_cpu_ptr(&fpsimd_last_state); @@ -861,7 +862,7 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, if (task == current) { get_cpu_fpsimd_context(); - fpsimd_save(); + fpsimd_save_user_state(); } fpsimd_flush_task_state(task); @@ -1473,6 +1474,16 @@ void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) current); } +static void fpsimd_load_kernel_state(struct task_struct *task) +{ + fpsimd_load_state(&task->thread.kernel_fpsimd_state); +} + +static void fpsimd_save_kernel_state(struct task_struct *task) +{ + fpsimd_save_state(&task->thread.kernel_fpsimd_state); +} + void fpsimd_thread_switch(struct task_struct *next) { bool wrong_task, wrong_cpu; @@ -1483,19 +1494,28 @@ void fpsimd_thread_switch(struct task_struct *next) WARN_ON_ONCE(!irqs_disabled()); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_save_kernel_state(current); + else + fpsimd_save_user_state(); - /* - * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's - * state. For kernel threads, FPSIMD registers are never loaded - * and wrong_task and wrong_cpu will always be true. - */ - wrong_task = __this_cpu_read(fpsimd_last_state.st) != - &next->thread.uw.fpsimd_state; - wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); + if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { + fpsimd_load_kernel_state(next); + set_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE); + } else { + /* + * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's + * state. For kernel threads, FPSIMD registers are never + * loaded with user mode FPSIMD state and so wrong_task and + * wrong_cpu will always be true. + */ + wrong_task = __this_cpu_read(fpsimd_last_state.st) != + &next->thread.uw.fpsimd_state; + wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id(); - update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, - wrong_task || wrong_cpu); + update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE, + wrong_task || wrong_cpu); + } } static void fpsimd_flush_thread_vl(enum vec_type type) @@ -1585,7 +1605,7 @@ void fpsimd_preserve_current_state(void) return; get_cpu_fpsimd_context(); - fpsimd_save(); + fpsimd_save_user_state(); put_cpu_fpsimd_context(); } @@ -1803,7 +1823,7 @@ void fpsimd_save_and_flush_cpu_state(void) return; WARN_ON(preemptible()); local_irq_save(flags); - fpsimd_save(); + fpsimd_save_user_state(); fpsimd_flush_cpu_state(); local_irq_restore(flags); } @@ -1837,10 +1857,37 @@ void kernel_neon_begin(void) get_cpu_fpsimd_context(); /* Save unsaved fpsimd state, if any: */ - fpsimd_save(); + if (test_thread_flag(TIF_KERNEL_FPSTATE)) { + BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()); + fpsimd_save_kernel_state(current); + } else { + fpsimd_save_user_state(); + + /* + * Set the thread flag so that the kernel mode FPSIMD state + * will be context switched along with the rest of the task + * state. + * + * On non-PREEMPT_RT, softirqs may interrupt task level kernel + * mode FPSIMD, but the task will not be preemptible so setting + * TIF_KERNEL_FPSTATE for those would be both wrong (as it + * would mark the task context FPSIMD state as requiring a + * context switch) and unnecessary. + * + * On PREEMPT_RT, softirqs are serviced from a separate thread, + * which is scheduled as usual, and this guarantees that these + * softirqs are not interrupting use of the FPSIMD in kernel + * mode in task context. So in this case, setting the flag here + * is always appropriate. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq()) + set_thread_flag(TIF_KERNEL_FPSTATE); + } /* Invalidate any task state remaining in the fpsimd regs: */ fpsimd_flush_cpu_state(); + + put_cpu_fpsimd_context(); } EXPORT_SYMBOL_GPL(kernel_neon_begin); @@ -1858,7 +1905,16 @@ void kernel_neon_end(void) if (!system_supports_fpsimd()) return; - put_cpu_fpsimd_context(); + /* + * If we are returning from a nested use of kernel mode FPSIMD, restore + * the task context kernel mode FPSIMD state. This can only happen when + * running in softirq context on non-PREEMPT_RT. + */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq() && + test_thread_flag(TIF_KERNEL_FPSTATE)) + fpsimd_load_kernel_state(current); + else + clear_thread_flag(TIF_KERNEL_FPSTATE); } EXPORT_SYMBOL_GPL(kernel_neon_end); From 2632e25217696712681dd1f3ecc0d71624ea3b23 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 8 Dec 2023 12:32:22 +0100 Subject: [PATCH 224/310] arm64: fpsimd: Implement lazy restore for kernel mode FPSIMD Now that kernel mode FPSIMD state is context switched along with other task state, we can enable the existing logic that keeps track of which task's FPSIMD state the CPU is holding in its registers. If it is the context of the task that we are switching to, we can elide the reload of the FPSIMD state from memory. Note that we also need to check whether the FPSIMD state on this CPU is the most recent: if a task gets migrated away and back again, the state in memory may be more recent than the state in the CPU. So add another CPU id field to task_struct to keep track of this. (We could reuse the existing CPU id field used for user mode context, but that might result in user state to be discarded unnecessarily, given that two distinct CPUs could be holding the most recent user mode state and the most recent kernel mode state) Signed-off-by: Ard Biesheuvel Reviewed-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231208113218.3001940-9-ardb@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 1 + arch/arm64/kernel/fpsimd.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ce6eebd6c08b..5b0a04810b23 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -169,6 +169,7 @@ struct thread_struct { struct debug_info debug; /* debugging */ struct user_fpsimd_state kernel_fpsimd_state; + unsigned int kernel_fpsimd_cpu; #ifdef CONFIG_ARM64_PTR_AUTH struct ptrauth_keys_user keys_user; #ifdef CONFIG_ARM64_PTR_AUTH_KERNEL diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index e4595a8d731f..e714d0cd5d1e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1476,12 +1476,30 @@ void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs) static void fpsimd_load_kernel_state(struct task_struct *task) { + struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state); + + /* + * Elide the load if this CPU holds the most recent kernel mode + * FPSIMD context of the current task. + */ + if (last->st == &task->thread.kernel_fpsimd_state && + task->thread.kernel_fpsimd_cpu == smp_processor_id()) + return; + fpsimd_load_state(&task->thread.kernel_fpsimd_state); } static void fpsimd_save_kernel_state(struct task_struct *task) { + struct cpu_fp_state cpu_fp_state = { + .st = &task->thread.kernel_fpsimd_state, + .to_save = FP_STATE_FPSIMD, + }; + fpsimd_save_state(&task->thread.kernel_fpsimd_state); + fpsimd_bind_state_to_cpu(&cpu_fp_state); + + task->thread.kernel_fpsimd_cpu = smp_processor_id(); } void fpsimd_thread_switch(struct task_struct *next) From 9dbd5927408c4a0707de73ae9dd9306b184e8fee Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 Nov 2023 00:27:44 +1100 Subject: [PATCH 225/310] selftests/powerpc: Fix error handling in FPU/VMX preemption tests The FPU & VMX preemption tests do not check for errors returned by the low-level asm routines, preempt_fpu() / preempt_vsx() respectively. That means any register corruption detected by the asm routines does not result in a test failure. Fix it by returning the return value of the asm routines from the pthread child routines. Fixes: e5ab8be68e44 ("selftests/powerpc: Test preservation of FPU and VMX regs across preemption") Signed-off-by: Michael Ellerman Link: https://msgid.link/20231128132748.1990179-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/math/fpu_preempt.c | 9 +++++---- tools/testing/selftests/powerpc/math/vmx_preempt.c | 10 ++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c index 5235bdc8c0b1..3e5b5663d244 100644 --- a/tools/testing/selftests/powerpc/math/fpu_preempt.c +++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c @@ -37,19 +37,20 @@ __thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, int threads_starting; int running; -extern void preempt_fpu(double *darray, int *threads_starting, int *running); +extern int preempt_fpu(double *darray, int *threads_starting, int *running); void *preempt_fpu_c(void *p) { + long rc; int i; + srand(pthread_self()); for (i = 0; i < 21; i++) darray[i] = rand(); - /* Test failed if it ever returns */ - preempt_fpu(darray, &threads_starting, &running); + rc = preempt_fpu(darray, &threads_starting, &running); - return p; + return (void *)rc; } int test_preempt_fpu(void) diff --git a/tools/testing/selftests/powerpc/math/vmx_preempt.c b/tools/testing/selftests/powerpc/math/vmx_preempt.c index 6761d6ce30ec..6f7cf400c687 100644 --- a/tools/testing/selftests/powerpc/math/vmx_preempt.c +++ b/tools/testing/selftests/powerpc/math/vmx_preempt.c @@ -37,19 +37,21 @@ __thread vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12}, int threads_starting; int running; -extern void preempt_vmx(vector int *varray, int *threads_starting, int *running); +extern int preempt_vmx(vector int *varray, int *threads_starting, int *running); void *preempt_vmx_c(void *p) { int i, j; + long rc; + srand(pthread_self()); for (i = 0; i < 12; i++) for (j = 0; j < 4; j++) varray[i][j] = rand(); - /* Test fails if it ever returns */ - preempt_vmx(varray, &threads_starting, &running); - return p; + rc = preempt_vmx(varray, &threads_starting, &running); + + return (void *)rc; } int test_preempt_vmx(void) From e5d00aaac651a69b8016d355123438387bfd37be Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 Nov 2023 00:27:45 +1100 Subject: [PATCH 226/310] selftests/powerpc: Check all FPRs in fpu_preempt There's a selftest that checks FPRs aren't corrupted by preemption, or just process scheduling. However it only checks the non-volatile FPRs, meaning corruption of the volatile FPRs could go undetected. The check_fpu function it calls is used by several other tests, so for now add a new routine to check all the FPRs. Increase the size of the array of FPRs to 32, and initialise them all with random values. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231128132748.1990179-2-mpe@ellerman.id.au --- .../testing/selftests/powerpc/math/fpu_asm.S | 41 +++++++++++++++++-- .../selftests/powerpc/math/fpu_preempt.c | 15 +++---- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S index 9dc0c158f871..051392ad3ce7 100644 --- a/tools/testing/selftests/powerpc/math/fpu_asm.S +++ b/tools/testing/selftests/powerpc/math/fpu_asm.S @@ -66,6 +66,40 @@ FUNC_START(check_fpu) li r3,0 # Success!!! 1: blr + +// int check_all_fprs(double darray[32]) +FUNC_START(check_all_fprs) + PUSH_BASIC_STACK(8) + mr r4, r3 // r4 = darray + li r3, 1 // prepare for failure + + stfd f31, STACK_FRAME_LOCAL(0, 0)(sp) // backup f31 + + // Check regs f0-f30, using f31 as scratch + .set i, 0 + .rept 31 + lfd f31, (8 * i)(r4) // load expected value + fcmpu cr0, i, f31 // compare + bne cr0, 1f // bail if mismatch + .set i, i + 1 + .endr + + lfd f31, STACK_FRAME_LOCAL(0, 0)(sp) // reload f31 + stfd f30, STACK_FRAME_LOCAL(0, 0)(sp) // backup f30 + + lfd f30, (8 * 31)(r4) // load expected value of f31 + fcmpu cr0, f30, f31 // compare + bne cr0, 1f // bail if mismatch + + lfd f30, STACK_FRAME_LOCAL(0, 0)(sp) // reload f30 + + // Success + li r3, 0 + +1: POP_BASIC_STACK(8) + blr +FUNC_END(check_all_fprs) + FUNC_START(test_fpu) # r3 holds pointer to where to put the result of fork # r4 holds pointer to the pid @@ -104,8 +138,8 @@ FUNC_START(preempt_fpu) std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting std r5,STACK_FRAME_PARAM(2)(sp) # int *running - bl load_fpu - nop + // Load FPRs with expected values + OP_REGS lfd, 8, 0, 31, r3 sync # Atomic DEC @@ -116,8 +150,7 @@ FUNC_START(preempt_fpu) bne- 1b 2: ld r3,STACK_FRAME_PARAM(0)(sp) - bl check_fpu - nop + bl check_all_fprs cmpdi r3,0 bne 3f ld r4,STACK_FRAME_PARAM(2)(sp) diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c index 3e5b5663d244..24b5abacccdc 100644 --- a/tools/testing/selftests/powerpc/math/fpu_preempt.c +++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c @@ -1,13 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2015, Cyril Bur, IBM Corp. + * Copyright 2023, Michael Ellerman, IBM Corp. * * This test attempts to see if the FPU registers change across preemption. - * Two things should be noted here a) The check_fpu function in asm only checks - * the non volatile registers as it is reused from the syscall test b) There is - * no way to be sure preemption happened so this test just uses many threads - * and a long wait. As such, a successful test doesn't mean much but a failure - * is bad. + * There is no way to be sure preemption happened so this test just uses many + * threads and a long wait. As such, a successful test doesn't mean much but + * a failure is bad. */ #include @@ -30,9 +29,7 @@ #define THREAD_FACTOR 8 -__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, - 2.1}; +__thread double darray[32]; int threads_starting; int running; @@ -45,7 +42,7 @@ void *preempt_fpu_c(void *p) int i; srand(pthread_self()); - for (i = 0; i < 21; i++) + for (i = 0; i < ARRAY_SIZE(darray); i++) darray[i] = rand(); rc = preempt_fpu(darray, &threads_starting, &running); From 2ba107f6795d780bb54b85040a9b2c6a60fccb15 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 Nov 2023 00:27:46 +1100 Subject: [PATCH 227/310] selftests/powerpc: Generate better bit patterns for FPU tests The fpu_preempt test randomly initialises an array of doubles to try and detect FPU register corruption. However the values it generates do not occupy the full range of values possible in the 64-bit double, meaning some partial register corruption could go undetected. Without getting too carried away, add some better initialisation to generate values that occupy more bits. Sample values before: f0 902677510 (raw 0x41cae6e203000000) f1 325217596 (raw 0x41b3626d3c000000) f2 1856578300 (raw 0x41dbaa48bf000000) f3 1247189984 (raw 0x41d295a6f8000000) And after: f0 1.1078153481413311e-09 (raw 0x3e13083932805cc2) f1 1.0576648474801922e+17 (raw 0x43777c20eb88c261) f2 -6.6245033413594075e-10 (raw 0xbe06c2f989facae9) f3 3.0085988827156291e+18 (raw 0x43c4e0585f2df37b) Signed-off-by: Michael Ellerman Link: https://msgid.link/20231128132748.1990179-3-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/math/fpu.h | 25 +++++++++++++++++++ .../selftests/powerpc/math/fpu_preempt.c | 6 ++--- 2 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 tools/testing/selftests/powerpc/math/fpu.h diff --git a/tools/testing/selftests/powerpc/math/fpu.h b/tools/testing/selftests/powerpc/math/fpu.h new file mode 100644 index 000000000000..a8ad0d42604e --- /dev/null +++ b/tools/testing/selftests/powerpc/math/fpu.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2023, Michael Ellerman, IBM Corporation. + */ + +#ifndef _SELFTESTS_POWERPC_FPU_H +#define _SELFTESTS_POWERPC_FPU_H + +static inline void randomise_darray(double *darray, int num) +{ + long val; + + for (int i = 0; i < num; i++) { + val = random(); + if (val & 1) + val *= -1; + + if (val & 2) + darray[i] = 1.0 / val; + else + darray[i] = val * val; + } +} + +#endif /* _SELFTESTS_POWERPC_FPU_H */ diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c index 24b5abacccdc..97dff3690136 100644 --- a/tools/testing/selftests/powerpc/math/fpu_preempt.c +++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c @@ -19,6 +19,7 @@ #include #include "utils.h" +#include "fpu.h" /* Time to wait for workers to get preempted (seconds) */ #define PREEMPT_TIME 20 @@ -39,12 +40,9 @@ extern int preempt_fpu(double *darray, int *threads_starting, int *running); void *preempt_fpu_c(void *p) { long rc; - int i; srand(pthread_self()); - for (i = 0; i < ARRAY_SIZE(darray); i++) - darray[i] = rand(); - + randomise_darray(darray, ARRAY_SIZE(darray)); rc = preempt_fpu(darray, &threads_starting, &running); return (void *)rc; From 60d2c3af9a0c04dc2d34a62e9f5e7033b40dfed8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 Nov 2023 00:27:47 +1100 Subject: [PATCH 228/310] selftests/powerpc: Run fpu_preempt test for 60 seconds The FPU preempt test only runs for 20 seconds, which is not particularly long. Run it for 60 seconds to increase the chance of detecting corruption. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231128132748.1990179-4-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/math/fpu_preempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c index 97dff3690136..9ddede0770ed 100644 --- a/tools/testing/selftests/powerpc/math/fpu_preempt.c +++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c @@ -22,7 +22,7 @@ #include "fpu.h" /* Time to wait for workers to get preempted (seconds) */ -#define PREEMPT_TIME 20 +#define PREEMPT_TIME 60 /* * Factor by which to multiply number of online CPUs for total number of * worker threads From 1bdf22580b794a498b17617bcd7ae417d6b78444 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 29 Nov 2023 00:27:48 +1100 Subject: [PATCH 229/310] selftests/powerpc: Check all FPRs in fpu_syscall test There is a selftest that checks if FPRs are corrupted across a fork, aka clone. It was added as part of the series that optimised the clone path to save the parent's FP state without "giving up" (turning off FP). See commit 8792468da5e1 ("powerpc: Add the ability to save FPU without giving it up"). The test encodes the assumption that FPRs 0-13 are volatile across the syscall, by only checking the volatile FPRs are not changed by the fork. There was also a comment in the fpu_preempt test alluding to that: The check_fpu function in asm only checks the non volatile registers as it is reused from the syscall test It is true that the function call ABI treats f0-f13 as volatile, however the syscall ABI has since been documented as *not* treating those registers as volatile. See commit 7b8845a2a2ec ("powerpc/64: Document the syscall ABI"). So change the test to check all FPRs are not corrupted by the syscall. Note that this currently fails, because save_fpu() etc. do not restore f0/vsr0. Signed-off-by: Michael Ellerman Link: https://msgid.link/20231128132748.1990179-5-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/math/fpu_asm.S | 7 ++++--- tools/testing/selftests/powerpc/math/fpu_syscall.c | 8 +++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S index 051392ad3ce7..efe1e1be4695 100644 --- a/tools/testing/selftests/powerpc/math/fpu_asm.S +++ b/tools/testing/selftests/powerpc/math/fpu_asm.S @@ -109,8 +109,9 @@ FUNC_START(test_fpu) std r3,STACK_FRAME_PARAM(0)(sp) # Address of darray std r4,STACK_FRAME_PARAM(1)(sp) # Address of pid - bl load_fpu - nop + // Load FPRs with expected values + OP_REGS lfd, 8, 0, 31, r3 + li r0,__NR_fork sc @@ -119,7 +120,7 @@ FUNC_START(test_fpu) std r3,0(r9) ld r3,STACK_FRAME_PARAM(0)(sp) - bl check_fpu + bl check_all_fprs nop POP_FPU(256) diff --git a/tools/testing/selftests/powerpc/math/fpu_syscall.c b/tools/testing/selftests/powerpc/math/fpu_syscall.c index 694f225c7e45..751d46b133fc 100644 --- a/tools/testing/selftests/powerpc/math/fpu_syscall.c +++ b/tools/testing/selftests/powerpc/math/fpu_syscall.c @@ -14,12 +14,11 @@ #include #include "utils.h" +#include "fpu.h" extern int test_fpu(double *darray, pid_t *pid); -double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, - 2.1}; +double darray[32]; int syscall_fpu(void) { @@ -27,6 +26,9 @@ int syscall_fpu(void) int i; int ret; int child_ret; + + randomise_darray(darray, ARRAY_SIZE(darray)); + for (i = 0; i < 1000; i++) { /* test_fpu will fork() */ ret = test_fpu(darray, &fork_pid); From eb183b2cd0a6549992eca3c4ada0b1bc1d9340f5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 13 Dec 2023 09:47:52 +0000 Subject: [PATCH 230/310] Revert "perf/arm_dmc620: Remove duplicate format attribute #defines" This reverts commit a5f4ca68f348ac059efd6a3d7ad4040aed1c0818. Pulling in the Arm-specific 'linux/perf/arm_pmu.h' header breaks the allmodconfig build for x86: > In file included from drivers/perf/arm_dmc620_pmu.c:26: > include/linux/perf/arm_pmu.h:15:10: fatal error: asm/cputype.h: No such file or directory > 15 | #include > | ^~~~~~~~~~~~~~~ Just put things back like they were so that the driver can continue to be compile-tested on a variety of architectures. Link: https://lore.kernel.org/r/20231213100931.12d9d85e@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Will Deacon --- drivers/perf/arm_dmc620_pmu.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 9de9dc8db8db..30cea6859574 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -190,6 +189,27 @@ static const struct attribute_group dmc620_pmu_events_attr_group = { #define ATTR_CFG_FLD_clkdiv2_LO 9 #define ATTR_CFG_FLD_clkdiv2_HI 9 +#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi + +#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) + +#define GEN_PMU_FORMAT_ATTR(name) \ + PMU_FORMAT_ATTR(name, \ + _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI)) + +#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ + ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) + +#define ATTR_CFG_GET_FLD(attr, name) \ + _ATTR_CFG_GET_FLD(attr, \ + ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI) + GEN_PMU_FORMAT_ATTR(mask); GEN_PMU_FORMAT_ATTR(match); GEN_PMU_FORMAT_ATTR(invert); From 070b71f428facd9130319707db854ed8bd24637a Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Thu, 16 Nov 2023 17:50:32 +0530 Subject: [PATCH 231/310] powerpc/hv-gpci: Add return value check in affinity_domain_via_partition_show function To access hv-gpci kernel interface files data, the "Enable Performance Information Collection" option has to be set in hmc. Incase that option is not set and user try to read the interface files, it should give error message as operation not permitted. Result of accessing added interface files with disabled performance collection option: [command]# cat processor_bus_topology cat: processor_bus_topology: Operation not permitted [command]# cat processor_config cat: processor_config: Operation not permitted [command]# cat affinity_domain_via_domain cat: affinity_domain_via_domain: Operation not permitted [command]# cat affinity_domain_via_virtual_processor cat: affinity_domain_via_virtual_processor: Operation not permitted [command]# cat affinity_domain_via_partition Based on above result there is no error message when reading affinity_domain_via_partition file because of missing check for failed hcall. Fix this issue by adding a check in the start of affinity_domain_via_partition_show function, to return error incase hcall fails, with error type other then H_PARAMETER. Fixes: a15e0d6a6929 ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show affinity domain via partition information") Reported-by: Disha Goel Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20231116122033.160964-1-kjain@linux.ibm.com --- arch/powerpc/perf/hv-gpci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 39dbe6b348df..27f18119fda1 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -534,6 +534,9 @@ static ssize_t affinity_domain_via_partition_show(struct device *dev, struct dev if (!ret) goto parse_result; + if (ret && (ret != H_PARAMETER)) + goto out; + /* * ret value as 'H_PARAMETER' implies that the current buffer size * can't accommodate all the information, and a partial buffer From 01e346ffefda3a7088afebf02b940614179688e7 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:48 -0600 Subject: [PATCH 232/310] powerpc/rtas: Avoid warning on invalid token argument to sys_rtas() rtas_token_to_function() WARNs when passed an invalid token; it's meant to catch bugs in kernel-based users of RTAS functions. However, user space controls the token value passed to rtas_token_to_function() by block_rtas_call(), so user space with sufficient privilege to use sys_rtas() can trigger the warnings at will: unexpected failed lookup for token 2048 WARNING: CPU: 20 PID: 2247 at arch/powerpc/kernel/rtas.c:556 rtas_token_to_function+0xfc/0x110 ... NIP rtas_token_to_function+0xfc/0x110 LR rtas_token_to_function+0xf8/0x110 Call Trace: rtas_token_to_function+0xf8/0x110 (unreliable) sys_rtas+0x188/0x880 system_call_exception+0x268/0x530 system_call_common+0x160/0x2c4 It's desirable to continue warning on bogus tokens in rtas_token_to_function(). Currently it is used to look up RTAS function descriptors when tracing, where we know there has to have been a successful descriptor lookup by different means already, and it would be a serious inconsistency for the reverse lookup to fail. So instead of weakening rtas_token_to_function()'s contract by removing the warnings, introduce rtas_token_to_function_untrusted(), which has no opinion on failed lookups. Convert block_rtas_call() and rtas_token_to_function() to use it. Fixes: 8252b88294d2 ("powerpc/rtas: improve function information lookups") Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-1-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index c49f078382a9..ce37dc9860ef 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -544,6 +544,21 @@ static int __init rtas_token_to_function_xarray_init(void) } arch_initcall(rtas_token_to_function_xarray_init); +/* + * For use by sys_rtas(), where the token value is provided by user + * space and we don't want to warn on failed lookups. + */ +static const struct rtas_function *rtas_token_to_function_untrusted(s32 token) +{ + return xa_load(&rtas_token_to_function_xarray, token); +} + +/* + * Reverse lookup for deriving the function descriptor from a + * known-good token value in contexts where the former is not already + * available. @token must be valid, e.g. derived from the result of a + * prior lookup against the function table. + */ static const struct rtas_function *rtas_token_to_function(s32 token) { const struct rtas_function *func; @@ -551,7 +566,7 @@ static const struct rtas_function *rtas_token_to_function(s32 token) if (WARN_ONCE(token < 0, "invalid token %d", token)) return NULL; - func = xa_load(&rtas_token_to_function_xarray, token); + func = rtas_token_to_function_untrusted(token); if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token)) return NULL; @@ -1721,7 +1736,7 @@ static bool block_rtas_call(int token, int nargs, * If this token doesn't correspond to a function the kernel * understands, you're not allowed to call it. */ - func = rtas_token_to_function(token); + func = rtas_token_to_function_untrusted(token); if (!func) goto err; /* From c500c6e736df030f8956080738f59701c0b43dd8 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:49 -0600 Subject: [PATCH 233/310] powerpc/rtas: Add for_each_rtas_function() iterator Add a convenience macro for iterating over every element of the internal function table and convert the one site that can use it. An additional user of the macro is anticipated in changes to follow. Reviewed-by: "Aneesh Kumar K.V (IBM)" Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-2-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ce37dc9860ef..ae9b10c954a1 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -454,6 +454,11 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { }, }; +#define for_each_rtas_function(funcp) \ + for (funcp = &rtas_function_table[0]; \ + funcp < &rtas_function_table[ARRAY_SIZE(rtas_function_table)]; \ + ++funcp) + /* * Nearly all RTAS calls need to be serialized. All uses of the * default rtas_args block must hold rtas_lock. @@ -525,10 +530,10 @@ static DEFINE_XARRAY(rtas_token_to_function_xarray); static int __init rtas_token_to_function_xarray_init(void) { + const struct rtas_function *func; int err = 0; - for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) { - const struct rtas_function *func = &rtas_function_table[i]; + for_each_rtas_function(func) { const s32 token = func->token; if (token == RTAS_UNKNOWN_SERVICE) From 669acc7eec223a81ea5e2420de85b61979ab7dad Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:50 -0600 Subject: [PATCH 234/310] powerpc/rtas: Fall back to linear search on failed token->function lookup Enabling any of the powerpc:rtas_* tracepoints at boot is likely to result in an oops on RTAS platforms. For example, booting a QEMU pseries model with 'trace_event=powerpc:rtas_input' in the command line leads to: BUG: Kernel NULL pointer dereference on read at 0x00000008 Oops: Kernel access of bad area, sig: 7 [#1] NIP [c00000000004231c] do_enter_rtas+0x1bc/0x460 LR [c00000000004231c] do_enter_rtas+0x1bc/0x460 Call Trace: do_enter_rtas+0x1bc/0x460 (unreliable) rtas_call+0x22c/0x4a0 rtas_get_boot_time+0x80/0x14c read_persistent_clock64+0x124/0x150 read_persistent_wall_and_boot_offset+0x28/0x58 timekeeping_init+0x70/0x348 start_kernel+0xa0c/0xc1c start_here_common+0x1c/0x20 (This is preceded by a warning for the failed lookup in rtas_token_to_function().) This happens when __do_enter_rtas_trace() attempts a token to function descriptor lookup before the xarray containing the mappings has been set up. Fall back to linear scan of the table if rtas_token_to_function_xarray is empty. Fixes: 24098f580e2b ("powerpc/rtas: add tracepoints around RTAS entry") Reviewed-by: "Aneesh Kumar K.V (IBM)" Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-3-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ae9b10c954a1..f60a8e7bd5ed 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -572,11 +572,21 @@ static const struct rtas_function *rtas_token_to_function(s32 token) return NULL; func = rtas_token_to_function_untrusted(token); + if (func) + return func; + /* + * Fall back to linear scan in case the reverse mapping hasn't + * been initialized yet. + */ + if (xa_empty(&rtas_token_to_function_xarray)) { + for_each_rtas_function(func) { + if (func->token == token) + return func; + } + } - if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token)) - return NULL; - - return func; + WARN_ONCE(true, "unexpected failed lookup for token %d", token); + return NULL; } /* This is here deliberately so it's only used in this file */ From 9592aa5ad59e736727fe7894e6e820e2d851abcf Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:51 -0600 Subject: [PATCH 235/310] powerpc/rtas: Add function return status constants Not all of the generic RTAS function statuses specified in PAPR have symbolic constants and descriptions in rtas.h. Fix this, providing a little more background, slightly updating the existing wording, and improving the formatting. Reviewed-by: "Aneesh Kumar K.V (IBM)" Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-4-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index a7110ed52e25..08d19e6904f7 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -201,12 +201,25 @@ typedef struct { /* Memory set aside for sys_rtas to use with calls that need a work area. */ #define RTAS_USER_REGION_SIZE (64 * 1024) -/* RTAS return status codes */ -#define RTAS_HARDWARE_ERROR -1 /* Hardware Error */ -#define RTAS_BUSY -2 /* RTAS Busy */ -#define RTAS_INVALID_PARAMETER -3 /* Invalid indicator/domain/sensor etc. */ -#define RTAS_EXTENDED_DELAY_MIN 9900 -#define RTAS_EXTENDED_DELAY_MAX 9905 +/* + * Common RTAS function return values, derived from the table "RTAS + * Status Word Values" in PAPR+ v2.13 7.2.8: "Return Codes". If a + * function can return a value in this table then generally it has the + * meaning listed here. More extended commentary in the documentation + * for rtas_call(). + * + * RTAS functions may use negative and positive numbers not in this + * set for function-specific error and success conditions, + * respectively. + */ +#define RTAS_SUCCESS 0 /* Success. */ +#define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified error. */ +#define RTAS_BUSY -2 /* Retry immediately. */ +#define RTAS_INVALID_PARAMETER -3 /* Invalid indicator/domain/sensor etc. */ +#define RTAS_UNEXPECTED_STATE_CHANGE -7 /* Seems limited to EEH and slot reset. */ +#define RTAS_EXTENDED_DELAY_MIN 9900 /* Retry after delaying for ~1ms. */ +#define RTAS_EXTENDED_DELAY_MAX 9905 /* Retry after delaying for ~100s. */ +#define RTAS_ML_ISOLATION_ERROR -9000 /* Multi-level isolation error. */ /* statuses specific to ibm,suspend-me */ #define RTAS_SUSPEND_ABORTED 9000 /* Suspension aborted */ From e7582edb78619abb4ebf0a6e1fed125dcd7243b6 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:52 -0600 Subject: [PATCH 236/310] powerpc/rtas: Move token validation from block_rtas_call() to sys_rtas() The rtas system call handler sys_rtas() delegates certain input validation steps to a helper function: block_rtas_call(). One of these steps ensures that the user-supplied token value maps to a known RTAS function. This is done by performing a "reverse" token-to-function lookup via rtas_token_to_function_untrusted() to obtain an rtas_function object. In changes to come, sys_rtas() itself will need the function descriptor for the token. To prepare: * Move the lookup and validation up into sys_rtas() and pass the resulting rtas_function pointer to block_rtas_call(), which is otherwise unconcerned with the token value. * Change block_rtas_call() to report the RTAS function name instead of the token value on validation failures, since it can now rely on having a valid function descriptor. One behavior change is that sys_rtas() now silently errors out when passed a bad token, before calling block_rtas_call(). So we will no longer log "RTAS call blocked - exploit attempt?" on invalid tokens. This is consistent with how sys_rtas() currently handles other "metadata" (nargs and nret), while block_rtas_call() is primarily concerned with validating the arguments to be passed to specific RTAS functions. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-5-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index f60a8e7bd5ed..ca5bb0b994ac 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1738,24 +1738,18 @@ static bool in_rmo_buf(u32 base, u32 end) end < (rtas_rmo_buf + RTAS_USER_REGION_SIZE); } -static bool block_rtas_call(int token, int nargs, +static bool block_rtas_call(const struct rtas_function *func, int nargs, struct rtas_args *args) { - const struct rtas_function *func; const struct rtas_filter *f; - const bool is_platform_dump = token == rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP); - const bool is_config_conn = token == rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR); + const bool is_platform_dump = + func == &rtas_function_table[RTAS_FNIDX__IBM_PLATFORM_DUMP]; + const bool is_config_conn = + func == &rtas_function_table[RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR]; u32 base, size, end; /* - * If this token doesn't correspond to a function the kernel - * understands, you're not allowed to call it. - */ - func = rtas_token_to_function_untrusted(token); - if (!func) - goto err; - /* - * And only functions with filters attached are allowed. + * Only functions with filters attached are allowed. */ f = func->filter; if (!f) @@ -1812,14 +1806,15 @@ static bool block_rtas_call(int token, int nargs, return false; err: pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n"); - pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n", - token, nargs, current->comm); + pr_err_ratelimited("sys_rtas: %s nargs=%d (called by %s)\n", + func->name, nargs, current->comm); return true; } /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { + const struct rtas_function *func; struct pin_cookie cookie; struct rtas_args args; unsigned long flags; @@ -1849,13 +1844,18 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) nargs * sizeof(rtas_arg_t)) != 0) return -EFAULT; - if (token == RTAS_UNKNOWN_SERVICE) + /* + * If this token doesn't correspond to a function the kernel + * understands, you're not allowed to call it. + */ + func = rtas_token_to_function_untrusted(token); + if (!func) return -EINVAL; args.rets = &args.args[nargs]; memset(args.rets, 0, nret * sizeof(rtas_arg_t)); - if (block_rtas_call(token, nargs, &args)) + if (block_rtas_call(func, nargs, &args)) return -EINVAL; if (token_is_restricted_errinjct(token)) { From adf7a019e5f82607fc0f0079926d0178afe8f4ef Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:53 -0600 Subject: [PATCH 237/310] powerpc/rtas: Facilitate high-level call sequences On RTAS platforms there is a general restriction that the OS must not enter RTAS on more than one CPU at a time. This low-level serialization requirement is satisfied by holding a spin lock (rtas_lock) across most RTAS function invocations. However, some pseries RTAS functions require multiple successive calls to complete a logical operation. Beginning a new call sequence for such a function may disrupt any other sequences of that function already in progress. Safe and reliable use of these functions effectively requires higher-level serialization beyond what is already done at the level of RTAS entry and exit. Where a sequence-based RTAS function is invoked only through sys_rtas(), with no in-kernel users, there is no issue as far as the kernel is concerned. User space is responsible for appropriately serializing its call sequences. (Whether user space code actually takes measures to prevent sequence interleaving is another matter.) Examples of such functions currently include ibm,platform-dump and ibm,get-vpd. But where a sequence-based RTAS function has both user space and in-kernel uesrs, there is a hazard. Even if the in-kernel call sites of such a function serialize their sequences correctly, a user of sys_rtas() can invoke the same function at any time, potentially disrupting a sequence in progress. So in order to prevent disruption of kernel-based RTAS call sequences, they must serialize not only with themselves but also with sys_rtas() users, somehow. Preferably without adding more function-specific hacks to sys_rtas(). This is a prerequisite for adding an in-kernel call sequence of ibm,get-vpd, which is in a change to follow. Note that it has never been feasible for the kernel to prevent sys_rtas()-based sequences from being disrupted because control returns to user space on every call. sys_rtas()-based users of these functions have always been, and continue to be, responsible for coordinating their call sequences with other users, even those which may invoke the RTAS functions through less direct means than sys_rtas(). This is an unavoidable consequence of exposing sequence-based RTAS functions through sys_rtas(). * Add an optional mutex member to struct rtas_function. * Statically define a mutex for each RTAS function with known call sequence serialization requirements, and assign its address to the .lock member of the corresponding function table entry, along with justifying commentary. * In sys_rtas(), if the table entry for the RTAS function being called has a populated lock member, acquire it before taking rtas_lock and entering RTAS. * Kernel-based RTAS call sequences are expected to access the appropriate mutex explicitly by name. For example, a user of the ibm,activate-firmware RTAS function would do: int token = rtas_function_token(RTAS_FN_IBM_ACTIVATE_FIRMWARE); int fwrc; mutex_lock(&rtas_ibm_activate_firmware_lock); do { fwrc = rtas_call(token, 0, 1, NULL); } while (rtas_busy_delay(fwrc)); mutex_unlock(&rtas_ibm_activate_firmware_lock); There should be no perceivable change introduced here except that concurrent callers of the same RTAS function via sys_rtas() may block on a mutex instead of spinning on rtas_lock. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-6-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 3 ++ arch/powerpc/kernel/rtas.c | 83 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 08d19e6904f7..9bb2210c8d44 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -3,6 +3,7 @@ #define _POWERPC_RTAS_H #ifdef __KERNEL__ +#include #include #include #include @@ -512,6 +513,8 @@ extern char rtas_data_buf[RTAS_DATA_BUF_SIZE]; /* RMO buffer reserved for user-space RTAS use */ extern unsigned long rtas_rmo_buf; +extern struct mutex rtas_ibm_get_vpd_lock; + #define GLOBAL_INTERRUPT_QUEUE 9005 /** diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ca5bb0b994ac..4d28983e8b1d 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -70,14 +71,33 @@ struct rtas_filter { * ppc64le, and we want to keep it that way. It does * not make sense for this to be set when @filter * is NULL. + * @lock: Pointer to an optional dedicated per-function mutex. This + * should be set for functions that require multiple calls in + * sequence to complete a single operation, and such sequences + * will disrupt each other if allowed to interleave. Users of + * this function are required to hold the associated lock for + * the duration of the call sequence. Add an explanatory + * comment to the function table entry if setting this member. */ struct rtas_function { s32 token; const bool banned_for_syscall_on_le:1; const char * const name; const struct rtas_filter *filter; + struct mutex *lock; }; +/* + * Per-function locks for sequence-based RTAS functions. + */ +static DEFINE_MUTEX(rtas_ibm_activate_firmware_lock); +static DEFINE_MUTEX(rtas_ibm_get_dynamic_sensor_state_lock); +static DEFINE_MUTEX(rtas_ibm_get_indices_lock); +static DEFINE_MUTEX(rtas_ibm_lpar_perftools_lock); +static DEFINE_MUTEX(rtas_ibm_physical_attestation_lock); +static DEFINE_MUTEX(rtas_ibm_set_dynamic_indicator_lock); +DEFINE_MUTEX(rtas_ibm_get_vpd_lock); + static struct rtas_function rtas_function_table[] __ro_after_init = { [RTAS_FNIDX__CHECK_EXCEPTION] = { .name = "check-exception", @@ -125,6 +145,13 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = -1, .size_idx1 = -1, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ as of v2.13 doesn't explicitly impose any + * restriction, but this typically requires multiple + * calls before success, and there's no reason to + * allow sequences to interleave. + */ + .lock = &rtas_ibm_activate_firmware_lock, }, [RTAS_FNIDX__IBM_CBE_START_PTCAL] = { .name = "ibm,cbe-start-ptcal", @@ -196,6 +223,13 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 1, .size_idx1 = -1, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ v2.13 R1–7.3.19–3 is explicit that the OS + * must not call ibm,get-dynamic-sensor-state with + * different inputs until a non-retry status has been + * returned. + */ + .lock = &rtas_ibm_get_dynamic_sensor_state_lock, }, [RTAS_FNIDX__IBM_GET_INDICES] = { .name = "ibm,get-indices", @@ -203,6 +237,12 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 2, .size_idx1 = 3, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ v2.13 R1–7.3.17–2 says that the OS must not + * interleave ibm,get-indices call sequences with + * different inputs. + */ + .lock = &rtas_ibm_get_indices_lock, }, [RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY] = { .name = "ibm,get-rio-topology", @@ -220,6 +260,11 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 0, .size_idx1 = -1, .buf_idx2 = 1, .size_idx2 = 2, }, + /* + * PAPR+ v2.13 R1–7.3.20–4 indicates that sequences + * should not be allowed to interleave. + */ + .lock = &rtas_ibm_get_vpd_lock, }, [RTAS_FNIDX__IBM_GET_XIVE] = { .name = "ibm,get-xive", @@ -239,6 +284,11 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 2, .size_idx1 = 3, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ v2.13 R1–7.3.26–6 says the OS should allow + * only one call sequence in progress at a time. + */ + .lock = &rtas_ibm_lpar_perftools_lock, }, [RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE] = { .name = "ibm,manage-flash-image", @@ -277,6 +327,14 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 0, .size_idx1 = 1, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * This follows a sequence-based pattern similar to + * ibm,get-vpd et al. Since PAPR+ restricts + * interleaving call sequences for other functions of + * this style, assume the restriction applies here, + * even though it's not explicit in the spec. + */ + .lock = &rtas_ibm_physical_attestation_lock, }, [RTAS_FNIDX__IBM_PLATFORM_DUMP] = { .name = "ibm,platform-dump", @@ -284,6 +342,13 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 4, .size_idx1 = 5, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ v2.13 7.3.3.4.1 indicates that concurrent + * sequences of ibm,platform-dump are allowed if they + * are operating on different dump tags. So leave the + * lock pointer unset for now. This may need + * reconsideration if kernel-internal users appear. + */ }, [RTAS_FNIDX__IBM_POWER_OFF_UPS] = { .name = "ibm,power-off-ups", @@ -326,6 +391,12 @@ static struct rtas_function rtas_function_table[] __ro_after_init = { .buf_idx1 = 2, .size_idx1 = -1, .buf_idx2 = -1, .size_idx2 = -1, }, + /* + * PAPR+ v2.13 R1–7.3.18–3 says the OS must not call + * this function with different inputs until a + * non-retry status has been returned. + */ + .lock = &rtas_ibm_set_dynamic_indicator_lock, }, [RTAS_FNIDX__IBM_SET_EEH_OPTION] = { .name = "ibm,set-eeh-option", @@ -1888,6 +1959,15 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) buff_copy = get_errorlog_buffer(); + /* + * If this function has a mutex assigned to it, we must + * acquire it to avoid interleaving with any kernel-based uses + * of the same function. Kernel-based sequences acquire the + * appropriate mutex explicitly. + */ + if (func->lock) + mutex_lock(func->lock); + raw_spin_lock_irqsave(&rtas_lock, flags); cookie = lockdep_pin_lock(&rtas_lock); @@ -1903,6 +1983,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) lockdep_unpin_lock(&rtas_lock, cookie); raw_spin_unlock_irqrestore(&rtas_lock, flags); + if (func->lock) + mutex_unlock(func->lock); + if (buff_copy) { if (errbuf) log_error(errbuf, ERR_TYPE_RTAS_LOG, 0); From dc7637c402b90a197d3f21a3d78f2b00b67ea22a Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:54 -0600 Subject: [PATCH 238/310] powerpc/rtas: Serialize firmware activation sequences Use rtas_ibm_activate_firmware_lock to prevent interleaving call sequences of the ibm,activate-firmware RTAS function, which typically requires multiple calls to complete the update. While the spec does not specifically prohibit interleaved sequences, there's almost certainly no advantage to allowing them. Reviewed-by: "Aneesh Kumar K.V (IBM)" Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-7-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 4d28983e8b1d..72f6b5a402dd 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1734,10 +1734,14 @@ void rtas_activate_firmware(void) return; } + mutex_lock(&rtas_ibm_activate_firmware_lock); + do { fwrc = rtas_call(token, 0, 1, NULL); } while (rtas_busy_delay(fwrc)); + mutex_unlock(&rtas_ibm_activate_firmware_lock); + if (fwrc) pr_err("ibm,activate-firmware failed (%i)\n", fwrc); } From e3681107bc9f97c5948a1c8a3a97ac64907210ce Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:55 -0600 Subject: [PATCH 239/310] powerpc/rtas: Warn if per-function lock isn't held If the function descriptor has a populated lock member, then callers are required to hold it across calls. Now that the firmware activation sequence is appropriately guarded, we can warn when the requirement isn't satisfied. __do_enter_rtas_trace() gets reorganized a bit as a result of performing the function descriptor lookup unconditionally now. Reviewed-by: "Aneesh Kumar K.V (IBM)" Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-8-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 72f6b5a402dd..7e793b503e29 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -671,28 +671,25 @@ static void __do_enter_rtas(struct rtas_args *args) static void __do_enter_rtas_trace(struct rtas_args *args) { - const char *name = NULL; + const struct rtas_function *func = rtas_token_to_function(be32_to_cpu(args->token)); + + /* + * If there is a per-function lock, it must be held by the + * caller. + */ + if (func->lock) + lockdep_assert_held(func->lock); if (args == &rtas_args) lockdep_assert_held(&rtas_lock); - /* - * If the tracepoints that consume the function name aren't - * active, avoid the lookup. - */ - if ((trace_rtas_input_enabled() || trace_rtas_output_enabled())) { - const s32 token = be32_to_cpu(args->token); - const struct rtas_function *func = rtas_token_to_function(token); - name = func->name; - } - - trace_rtas_input(args, name); + trace_rtas_input(args, func->name); trace_rtas_ll_entry(args); __do_enter_rtas(args); trace_rtas_ll_exit(args); - trace_rtas_output(args, name); + trace_rtas_output(args, func->name); } static void do_enter_rtas(struct rtas_args *args) From 514f6ff4369a30bf0da71a1a09fd47b2fca5d76f Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:56 -0600 Subject: [PATCH 240/310] powerpc/pseries: Add papr-vpd character driver for VPD retrieval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PowerVM LPARs may retrieve Vital Product Data (VPD) for system components using the ibm,get-vpd RTAS function. We can expose this to user space with a /dev/papr-vpd character device, where the programming model is: struct papr_location_code plc = { .str = "", }; /* obtain all VPD */ int devfd = open("/dev/papr-vpd", O_RDONLY); int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc); size_t size = lseek(vpdfd, 0, SEEK_END); char *buf = malloc(size); pread(devfd, buf, size, 0); When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE), the file contains the result of a complete ibm,get-vpd sequence. The file contents are immutable from the POV of user space. To get a new view of the VPD, the client must create a new handle. This design choice insulates user space from most of the complexities that ibm,get-vpd brings: * ibm,get-vpd must be called more than once to obtain complete results. * Only one ibm,get-vpd call sequence should be in progress at a time; interleaved sequences will disrupt each other. Callers must have a protocol for serializing their use of the function. * A call sequence in progress may receive a "VPD changed, try again" status, requiring the client to abandon the sequence and start over. The memory required for the VPD buffers seems acceptable, around 20KB for all VPD on one of my systems. And the value of the /rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is consistently 300KB across various systems I've checked. I've implemented support for this new ABI in the rtas_get_vpd() function in librtas, which the vpdupdate command currently uses to populate its VPD database. I've verified that an unmodified vpdupdate binary generates an identical database when using a librtas.so that prefers the new ABI. Along with the papr-vpd.h header exposed to user space, this introduces a common papr-miscdev.h uapi header to share a base ioctl ID with similar drivers to come. Tested-by: Michal Suchánek Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-9-e9eafd0c8c6c@linux.ibm.com --- .../userspace-api/ioctl/ioctl-number.rst | 2 + arch/powerpc/include/uapi/asm/papr-miscdev.h | 9 + arch/powerpc/include/uapi/asm/papr-vpd.h | 22 + arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/papr-vpd.c | 541 ++++++++++++++++++ 5 files changed, 575 insertions(+) create mode 100644 arch/powerpc/include/uapi/asm/papr-miscdev.h create mode 100644 arch/powerpc/include/uapi/asm/papr-vpd.h create mode 100644 arch/powerpc/platforms/pseries/papr-vpd.c diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 4ea5b837399a..a950545bf7cd 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -349,6 +349,8 @@ Code Seq# Include File Comments 0xB1 00-1F PPPoX +0xB2 00 arch/powerpc/include/uapi/asm/papr-vpd.h powerpc/pseries VPD API + 0xB3 00 linux/mmc/ioctl.h 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h diff --git a/arch/powerpc/include/uapi/asm/papr-miscdev.h b/arch/powerpc/include/uapi/asm/papr-miscdev.h new file mode 100644 index 000000000000..49a2a270b7f3 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr-miscdev.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_PAPR_MISCDEV_H_ +#define _UAPI_PAPR_MISCDEV_H_ + +enum { + PAPR_MISCDEV_IOC_ID = 0xb2, +}; + +#endif /* _UAPI_PAPR_MISCDEV_H_ */ diff --git a/arch/powerpc/include/uapi/asm/papr-vpd.h b/arch/powerpc/include/uapi/asm/papr-vpd.h new file mode 100644 index 000000000000..1c88e87cb420 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr-vpd.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_PAPR_VPD_H_ +#define _UAPI_PAPR_VPD_H_ + +#include +#include + +struct papr_location_code { + /* + * PAPR+ v2.13 12.3.2.4 Converged Location Code Rules - Length + * Restrictions. 79 characters plus nul. + */ + char str[80]; +}; + +/* + * ioctl for /dev/papr-vpd. Returns a VPD handle fd corresponding to + * the location code. + */ +#define PAPR_VPD_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 0, struct papr_location_code) + +#endif /* _UAPI_PAPR_VPD_H_ */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 1476c5e4433c..f936962a2946 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o rtas-work-area.o papr-sysparm.o \ + papr-vpd.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpar.o mobility.o rng.o \ pci.o pci_dlpar.o eeh_pseries.o msi.o \ diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c new file mode 100644 index 000000000000..c29e85db5f35 --- /dev/null +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -0,0 +1,541 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "papr-vpd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Function-specific return values for ibm,get-vpd, derived from PAPR+ + * v2.13 7.3.20 "ibm,get-vpd RTAS Call". + */ +#define RTAS_IBM_GET_VPD_COMPLETE 0 /* All VPD has been retrieved. */ +#define RTAS_IBM_GET_VPD_MORE_DATA 1 /* More VPD is available. */ +#define RTAS_IBM_GET_VPD_START_OVER -4 /* VPD changed, restart call sequence. */ + +/** + * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd. + * @loc_code: In: Caller-provided location code buffer. Must be RTAS-addressable. + * @work_area: In: Caller-provided work area buffer for results. + * @sequence: In: Sequence number. Out: Next sequence number. + * @written: Out: Bytes written by ibm,get-vpd to @work_area. + * @status: Out: RTAS call status. + */ +struct rtas_ibm_get_vpd_params { + const struct papr_location_code *loc_code; + struct rtas_work_area *work_area; + u32 sequence; + u32 written; + s32 status; +}; + +/** + * rtas_ibm_get_vpd() - Call ibm,get-vpd to fill a work area buffer. + * @params: See &struct rtas_ibm_get_vpd_params. + * + * Calls ibm,get-vpd until it errors or successfully deposits data + * into the supplied work area. Handles RTAS retry statuses. Maps RTAS + * error statuses to reasonable errno values. + * + * The caller is expected to invoke rtas_ibm_get_vpd() multiple times + * to retrieve all the VPD for the provided location code. Only one + * sequence should be in progress at any time; starting a new sequence + * will disrupt any sequence already in progress. Serialization of VPD + * retrieval sequences is the responsibility of the caller. + * + * The caller should inspect @params.status to determine whether more + * calls are needed to complete the sequence. + * + * Context: May sleep. + * Return: -ve on error, 0 otherwise. + */ +static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params) +{ + const struct papr_location_code *loc_code = params->loc_code; + struct rtas_work_area *work_area = params->work_area; + u32 rets[2]; + s32 fwrc; + int ret; + + lockdep_assert_held(&rtas_ibm_get_vpd_lock); + + do { + fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_GET_VPD), 4, 3, + rets, + __pa(loc_code), + rtas_work_area_phys(work_area), + rtas_work_area_size(work_area), + params->sequence); + } while (rtas_busy_delay(fwrc)); + + switch (fwrc) { + case RTAS_HARDWARE_ERROR: + ret = -EIO; + break; + case RTAS_INVALID_PARAMETER: + ret = -EINVAL; + break; + case RTAS_IBM_GET_VPD_START_OVER: + ret = -EAGAIN; + break; + case RTAS_IBM_GET_VPD_MORE_DATA: + params->sequence = rets[0]; + fallthrough; + case RTAS_IBM_GET_VPD_COMPLETE: + params->written = rets[1]; + /* + * Kernel or firmware bug, do not continue. + */ + if (WARN(params->written > rtas_work_area_size(work_area), + "possible write beyond end of work area")) + ret = -EFAULT; + else + ret = 0; + break; + default: + ret = -EIO; + pr_err_ratelimited("unexpected ibm,get-vpd status %d\n", fwrc); + break; + } + + params->status = fwrc; + return ret; +} + +/* + * Internal VPD "blob" APIs for accumulating ibm,get-vpd results into + * an immutable buffer to be attached to a file descriptor. + */ +struct vpd_blob { + const char *data; + size_t len; +}; + +static bool vpd_blob_has_data(const struct vpd_blob *blob) +{ + return blob->data && blob->len; +} + +static void vpd_blob_free(const struct vpd_blob *blob) +{ + if (blob) { + kvfree(blob->data); + kfree(blob); + } +} + +/** + * vpd_blob_extend() - Append data to a &struct vpd_blob. + * @blob: The blob to extend. + * @data: The new data to append to @blob. + * @len: The length of @data. + * + * Context: May sleep. + * Return: -ENOMEM on allocation failure, 0 otherwise. + */ +static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) +{ + const size_t new_len = blob->len + len; + const size_t old_len = blob->len; + const char *old_ptr = blob->data; + char *new_ptr; + + new_ptr = old_ptr ? + kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : + kvmalloc(len, GFP_KERNEL_ACCOUNT); + + if (!new_ptr) + return -ENOMEM; + + memcpy(&new_ptr[old_len], data, len); + blob->data = new_ptr; + blob->len = new_len; + return 0; +} + +/** + * vpd_blob_generate() - Construct a new &struct vpd_blob. + * @generator: Function that supplies the blob data. + * @arg: Context pointer supplied by caller, passed to @generator. + * + * The @generator callback is invoked until it returns NULL. @arg is + * passed to @generator in its first argument on each call. When + * @generator returns data, it should store the data length in its + * second argument. + * + * Context: May sleep. + * Return: A completely populated &struct vpd_blob, or NULL on error. + */ +static const struct vpd_blob * +vpd_blob_generate(const char * (*generator)(void *, size_t *), void *arg) +{ + struct vpd_blob *blob; + const char *buf; + size_t len; + int err = 0; + + blob = kzalloc(sizeof(*blob), GFP_KERNEL_ACCOUNT); + if (!blob) + return NULL; + + while (err == 0 && (buf = generator(arg, &len))) + err = vpd_blob_extend(blob, buf, len); + + if (err != 0 || !vpd_blob_has_data(blob)) + goto free_blob; + + return blob; +free_blob: + vpd_blob_free(blob); + return NULL; +} + +/* + * Internal VPD sequence APIs. A VPD sequence is a series of calls to + * ibm,get-vpd for a given location code. The sequence ends when an + * error is encountered or all VPD for the location code has been + * returned. + */ + +/** + * struct vpd_sequence - State for managing a VPD sequence. + * @error: Shall be zero as long as the sequence has not encountered an error, + * -ve errno otherwise. Use vpd_sequence_set_err() to update this. + * @params: Parameter block to pass to rtas_ibm_get_vpd(). + */ +struct vpd_sequence { + int error; + struct rtas_ibm_get_vpd_params params; +}; + +/** + * vpd_sequence_begin() - Begin a VPD retrieval sequence. + * @seq: Uninitialized sequence state. + * @loc_code: Location code that defines the scope of the VPD to return. + * + * Initializes @seq with the resources necessary to carry out a VPD + * sequence. Callers must pass @seq to vpd_sequence_end() regardless + * of whether the sequence succeeds. + * + * Context: May sleep. + */ +static void vpd_sequence_begin(struct vpd_sequence *seq, + const struct papr_location_code *loc_code) +{ + /* + * Use a static data structure for the location code passed to + * RTAS to ensure it's in the RMA and avoid a separate work + * area allocation. Guarded by the function lock. + */ + static struct papr_location_code static_loc_code; + + /* + * We could allocate the work area before acquiring the + * function lock, but that would allow concurrent requests to + * exhaust the limited work area pool for no benefit. So + * allocate the work area under the lock. + */ + mutex_lock(&rtas_ibm_get_vpd_lock); + static_loc_code = *loc_code; + *seq = (struct vpd_sequence) { + .params = { + .work_area = rtas_work_area_alloc(SZ_4K), + .loc_code = &static_loc_code, + .sequence = 1, + }, + }; +} + +/** + * vpd_sequence_end() - Finalize a VPD retrieval sequence. + * @seq: Sequence state. + * + * Releases resources obtained by vpd_sequence_begin(). + */ +static void vpd_sequence_end(struct vpd_sequence *seq) +{ + rtas_work_area_free(seq->params.work_area); + mutex_unlock(&rtas_ibm_get_vpd_lock); +} + +/** + * vpd_sequence_should_stop() - Determine whether a VPD retrieval sequence + * should continue. + * @seq: VPD sequence state. + * + * Examines the sequence error state and outputs of the last call to + * ibm,get-vpd to determine whether the sequence in progress should + * continue or stop. + * + * Return: True if the sequence has encountered an error or if all VPD for + * this sequence has been retrieved. False otherwise. + */ +static bool vpd_sequence_should_stop(const struct vpd_sequence *seq) +{ + bool done; + + if (seq->error) + return true; + + switch (seq->params.status) { + case 0: + if (seq->params.written == 0) + done = false; /* Initial state. */ + else + done = true; /* All data consumed. */ + break; + case 1: + done = false; /* More data available. */ + break; + default: + done = true; /* Error encountered. */ + break; + } + + return done; +} + +static int vpd_sequence_set_err(struct vpd_sequence *seq, int err) +{ + /* Preserve the first error recorded. */ + if (seq->error == 0) + seq->error = err; + + return seq->error; +} + +/* + * Generator function to be passed to vpd_blob_generate(). + */ +static const char *vpd_sequence_fill_work_area(void *arg, size_t *len) +{ + struct vpd_sequence *seq = arg; + struct rtas_ibm_get_vpd_params *p = &seq->params; + + if (vpd_sequence_should_stop(seq)) + return NULL; + if (vpd_sequence_set_err(seq, rtas_ibm_get_vpd(p))) + return NULL; + *len = p->written; + return rtas_work_area_raw_buf(p->work_area); +} + +/* + * Higher-level VPD retrieval code below. These functions use the + * vpd_blob_* and vpd_sequence_* APIs defined above to create fd-based + * VPD handles for consumption by user space. + */ + +/** + * papr_vpd_run_sequence() - Run a single VPD retrieval sequence. + * @loc_code: Location code that defines the scope of VPD to return. + * + * Context: May sleep. Holds a mutex and an RTAS work area for its + * duration. Typically performs multiple sleepable slab + * allocations. + * + * Return: A populated &struct vpd_blob on success. Encoded error + * pointer otherwise. + */ +static const struct vpd_blob *papr_vpd_run_sequence(const struct papr_location_code *loc_code) +{ + const struct vpd_blob *blob; + struct vpd_sequence seq; + + vpd_sequence_begin(&seq, loc_code); + blob = vpd_blob_generate(vpd_sequence_fill_work_area, &seq); + if (!blob) + vpd_sequence_set_err(&seq, -ENOMEM); + vpd_sequence_end(&seq); + + if (seq.error) { + vpd_blob_free(blob); + return ERR_PTR(seq.error); + } + + return blob; +} + +/** + * papr_vpd_retrieve() - Return the VPD for a location code. + * @loc_code: Location code that defines the scope of VPD to return. + * + * Run VPD sequences against @loc_code until a blob is successfully + * instantiated, or a hard error is encountered, or a fatal signal is + * pending. + * + * Context: May sleep. + * Return: A fully populated VPD blob when successful. Encoded error + * pointer otherwise. + */ +static const struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code) +{ + const struct vpd_blob *blob; + + /* + * EAGAIN means the sequence errored with a -4 (VPD changed) + * status from ibm,get-vpd, and we should attempt a new + * sequence. PAPR+ v2.13 R1–7.3.20–5 indicates that this + * should be a transient condition, not something that happens + * continuously. But we'll stop trying on a fatal signal. + */ + do { + blob = papr_vpd_run_sequence(loc_code); + if (!IS_ERR(blob)) /* Success. */ + break; + if (PTR_ERR(blob) != -EAGAIN) /* Hard error. */ + break; + pr_info_ratelimited("VPD changed during retrieval, retrying\n"); + cond_resched(); + } while (!fatal_signal_pending(current)); + + return blob; +} + +static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off) +{ + const struct vpd_blob *blob = file->private_data; + + /* bug: we should not instantiate a handle without any data attached. */ + if (!vpd_blob_has_data(blob)) { + pr_err_once("handle without data\n"); + return -EIO; + } + + return simple_read_from_buffer(buf, size, off, blob->data, blob->len); +} + +static int papr_vpd_handle_release(struct inode *inode, struct file *file) +{ + const struct vpd_blob *blob = file->private_data; + + vpd_blob_free(blob); + + return 0; +} + +static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence) +{ + const struct vpd_blob *blob = file->private_data; + + return fixed_size_llseek(file, off, whence, blob->len); +} + + +static const struct file_operations papr_vpd_handle_ops = { + .read = papr_vpd_handle_read, + .llseek = papr_vpd_handle_seek, + .release = papr_vpd_handle_release, +}; + +/** + * papr_vpd_create_handle() - Create a fd-based handle for reading VPD. + * @ulc: Location code in user memory; defines the scope of the VPD to + * retrieve. + * + * Handler for PAPR_VPD_IOC_CREATE_HANDLE ioctl command. Validates + * @ulc and instantiates an immutable VPD "blob" for it. The blob is + * attached to a file descriptor for reading by user space. The memory + * backing the blob is freed when the file is released. + * + * The entire requested VPD is retrieved by this call and all + * necessary RTAS interactions are performed before returning the fd + * to user space. This keeps the read handler simple and ensures that + * the kernel can prevent interleaving of ibm,get-vpd call sequences. + * + * Return: The installed fd number if successful, -ve errno otherwise. + */ +static long papr_vpd_create_handle(struct papr_location_code __user *ulc) +{ + struct papr_location_code klc; + const struct vpd_blob *blob; + struct file *file; + long err; + int fd; + + if (copy_from_user(&klc, ulc, sizeof(klc))) + return -EFAULT; + + if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str))) + return -EINVAL; + + blob = papr_vpd_retrieve(&klc); + if (IS_ERR(blob)) + return PTR_ERR(blob); + + fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = fd; + goto free_blob; + } + + file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops, + (void *)blob, O_RDONLY); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto put_fd; + } + + file->f_mode |= FMODE_LSEEK | FMODE_PREAD; + fd_install(fd, file); + return fd; +put_fd: + put_unused_fd(fd); +free_blob: + vpd_blob_free(blob); + return err; +} + +/* + * Top-level ioctl handler for /dev/papr-vpd. + */ +static long papr_vpd_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_VPD_IOC_CREATE_HANDLE: + ret = papr_vpd_create_handle(argp); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_vpd_ops = { + .unlocked_ioctl = papr_vpd_dev_ioctl, +}; + +static struct miscdevice papr_vpd_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-vpd", + .fops = &papr_vpd_ops, +}; + +static __init int papr_vpd_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_GET_VPD)) + return -ENODEV; + + return misc_register(&papr_vpd_dev); +} +machine_device_initcall(pseries, papr_vpd_init); From 35aae182bd7b422be3cefc08c12207bf2b973364 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:57 -0600 Subject: [PATCH 241/310] powerpc/pseries/papr-sysparm: Validate buffer object lengths The ability to get and set system parameters will be exposed to user space, so let's get a little more strict about malformed papr_sysparm_buf objects. * Create accessors for the length field of struct papr_sysparm_buf. The length is always stored in MSB order and this is better than spreading the necessary conversions all over. * Reject attempts to submit invalid buffers to RTAS. * Warn if RTAS returns a buffer with an invalid length, clamping the returned length to a safe value that won't overrun the buffer. These are meant as precautionary measures to mitigate both firmware and kernel bugs in this area, should they arise, but I am not aware of any. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-10-e9eafd0c8c6c@linux.ibm.com --- arch/powerpc/platforms/pseries/papr-sysparm.c | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c b/arch/powerpc/platforms/pseries/papr-sysparm.c index fedc61599e6c..a1e7aeac7416 100644 --- a/arch/powerpc/platforms/pseries/papr-sysparm.c +++ b/arch/powerpc/platforms/pseries/papr-sysparm.c @@ -23,6 +23,46 @@ void papr_sysparm_buf_free(struct papr_sysparm_buf *buf) kfree(buf); } +static size_t papr_sysparm_buf_get_length(const struct papr_sysparm_buf *buf) +{ + return be16_to_cpu(buf->len); +} + +static void papr_sysparm_buf_set_length(struct papr_sysparm_buf *buf, size_t length) +{ + WARN_ONCE(length > sizeof(buf->val), + "bogus length %zu, clamping to safe value", length); + length = min(sizeof(buf->val), length); + buf->len = cpu_to_be16(length); +} + +/* + * For use on buffers returned from ibm,get-system-parameter before + * returning them to callers. Ensures the encoded length of valid data + * cannot overrun buf->val[]. + */ +static void papr_sysparm_buf_clamp_length(struct papr_sysparm_buf *buf) +{ + papr_sysparm_buf_set_length(buf, papr_sysparm_buf_get_length(buf)); +} + +/* + * Perform some basic diligence on the system parameter buffer before + * submitting it to RTAS. + */ +static bool papr_sysparm_buf_can_submit(const struct papr_sysparm_buf *buf) +{ + /* + * Firmware ought to reject buffer lengths that exceed the + * maximum specified in PAPR, but there's no reason for the + * kernel to allow them either. + */ + if (papr_sysparm_buf_get_length(buf) > sizeof(buf->val)) + return false; + + return true; +} + /** * papr_sysparm_get() - Retrieve the value of a PAPR system parameter. * @param: PAPR system parameter token as described in @@ -63,6 +103,9 @@ int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf) if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; + if (!papr_sysparm_buf_can_submit(buf)) + return -EINVAL; + work_area = rtas_work_area_alloc(sizeof(*buf)); memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); @@ -77,6 +120,7 @@ int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf) case 0: ret = 0; memcpy(buf, rtas_work_area_raw_buf(work_area), sizeof(*buf)); + papr_sysparm_buf_clamp_length(buf); break; case -3: /* parameter not implemented */ ret = -EOPNOTSUPP; @@ -115,6 +159,9 @@ int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf) if (token == RTAS_UNKNOWN_SERVICE) return -ENOENT; + if (!papr_sysparm_buf_can_submit(buf)) + return -EINVAL; + work_area = rtas_work_area_alloc(sizeof(*buf)); memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf)); From 905b9e48786ec55b2c469db77fb46e20bf3e4901 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:58 -0600 Subject: [PATCH 242/310] powerpc/pseries/papr-sysparm: Expose character device to user space Until now the papr_sysparm APIs have been kernel-internal. But user space needs access to PAPR system parameters too. The only method available to user space today to get or set system parameters is using sys_rtas() and /dev/mem to pass RTAS-addressable buffers between user space and firmware. This is incompatible with lockdown and should be deprecated. So provide an alternative ABI to user space in the form of a /dev/papr-sysparm character device with just two ioctl commands (get and set). The data payloads involved are small enough to fit in the ioctl argument buffer, making the code relatively simple. Exposing the system parameters through sysfs has been considered but it would be too awkward: * The kernel currently does not have to contain an exhaustive list of defined system parameters. This is a convenient property to maintain because we don't have to update the kernel whenever a new parameter is added to PAPR. Exporting a named attribute in sysfs for each parameter would negate this. * Some system parameters are text-based and some are not. * Retrieval of at least one system parameter requires input data, which a simple read-oriented interface can't support. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-11-e9eafd0c8c6c@linux.ibm.com --- .../userspace-api/ioctl/ioctl-number.rst | 2 + arch/powerpc/include/asm/papr-sysparm.h | 17 +- arch/powerpc/include/uapi/asm/papr-sysparm.h | 58 +++++++ arch/powerpc/platforms/pseries/papr-sysparm.c | 158 +++++++++++++++++- 4 files changed, 227 insertions(+), 8 deletions(-) create mode 100644 arch/powerpc/include/uapi/asm/papr-sysparm.h diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index a950545bf7cd..d8b6cb1a3636 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -351,6 +351,8 @@ Code Seq# Include File Comments 0xB2 00 arch/powerpc/include/uapi/asm/papr-vpd.h powerpc/pseries VPD API +0xB2 01-02 arch/powerpc/include/uapi/asm/papr-sysparm.h powerpc/pseries system parameter API + 0xB3 00 linux/mmc/ioctl.h 0xB4 00-0F linux/gpio.h 0xB5 00-0F uapi/linux/rpmsg.h diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h index f5fdbd8ae9db..0dbbff59101d 100644 --- a/arch/powerpc/include/asm/papr-sysparm.h +++ b/arch/powerpc/include/asm/papr-sysparm.h @@ -2,8 +2,10 @@ #ifndef _ASM_POWERPC_PAPR_SYSPARM_H #define _ASM_POWERPC_PAPR_SYSPARM_H +#include + typedef struct { - const u32 token; + u32 token; } papr_sysparm_t; #define mk_papr_sysparm(x_) ((papr_sysparm_t){ .token = x_, }) @@ -20,11 +22,14 @@ typedef struct { #define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50) #define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55) -enum { - PAPR_SYSPARM_MAX_INPUT = 1024, - PAPR_SYSPARM_MAX_OUTPUT = 4000, -}; - +/** + * struct papr_sysparm_buf - RTAS work area layout for system parameter functions. + * + * This is the memory layout of the buffers passed to/from + * ibm,get-system-parameter and ibm,set-system-parameter. It is + * distinct from the papr_sysparm_io_block structure that is passed + * between user space and the kernel. + */ struct papr_sysparm_buf { __be16 len; char val[PAPR_SYSPARM_MAX_OUTPUT]; diff --git a/arch/powerpc/include/uapi/asm/papr-sysparm.h b/arch/powerpc/include/uapi/asm/papr-sysparm.h new file mode 100644 index 000000000000..9f9a0f267ea5 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr-sysparm.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_PAPR_SYSPARM_H_ +#define _UAPI_PAPR_SYSPARM_H_ + +#include +#include +#include + +enum { + PAPR_SYSPARM_MAX_INPUT = 1024, + PAPR_SYSPARM_MAX_OUTPUT = 4000, +}; + +struct papr_sysparm_io_block { + __u32 parameter; + __u16 length; + char data[PAPR_SYSPARM_MAX_OUTPUT]; +}; + +/** + * PAPR_SYSPARM_IOC_GET - Retrieve the value of a PAPR system parameter. + * + * Uses _IOWR because of one corner case: Retrieving the value of the + * "OS Service Entitlement Status" parameter (60) requires the caller + * to supply input data (a date string) in the buffer passed to + * firmware. So the @length and @data of the incoming + * papr_sysparm_io_block are always used to initialize the work area + * supplied to ibm,get-system-parameter. No other parameters are known + * to parameterize the result this way, and callers are encouraged + * (but not required) to zero-initialize @length and @data in the + * common case. + * + * On error the contents of the ioblock are indeterminate. + * + * Return: + * 0: Success; @length is the length of valid data in @data, not to exceed @PAPR_SYSPARM_MAX_OUTPUT. + * -EIO: Platform error. (-1) + * -EINVAL: Incorrect data length or format. (-9999) + * -EPERM: The calling partition is not allowed to access this parameter. (-9002) + * -EOPNOTSUPP: Parameter not supported on this platform (-3) + */ +#define PAPR_SYSPARM_IOC_GET _IOWR(PAPR_MISCDEV_IOC_ID, 1, struct papr_sysparm_io_block) + +/** + * PAPR_SYSPARM_IOC_SET - Update the value of a PAPR system parameter. + * + * The contents of the ioblock are unchanged regardless of success. + * + * Return: + * 0: Success; the parameter has been updated. + * -EIO: Platform error. (-1) + * -EINVAL: Incorrect data length or format. (-9999) + * -EPERM: The calling partition is not allowed to access this parameter. (-9002) + * -EOPNOTSUPP: Parameter not supported on this platform (-3) + */ +#define PAPR_SYSPARM_IOC_SET _IOW(PAPR_MISCDEV_IOC_ID, 2, struct papr_sysparm_io_block) + +#endif /* _UAPI_PAPR_SYSPARM_H_ */ diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c b/arch/powerpc/platforms/pseries/papr-sysparm.c index a1e7aeac7416..7063ce8884e4 100644 --- a/arch/powerpc/platforms/pseries/papr-sysparm.c +++ b/arch/powerpc/platforms/pseries/papr-sysparm.c @@ -2,14 +2,20 @@ #define pr_fmt(fmt) "papr-sysparm: " fmt +#include #include +#include +#include #include #include +#include #include #include -#include +#include +#include #include #include +#include struct papr_sysparm_buf *papr_sysparm_buf_alloc(void) { @@ -87,7 +93,6 @@ static bool papr_sysparm_buf_can_submit(const struct papr_sysparm_buf *buf) * * Return: 0 on success, -errno otherwise. @buf is unmodified on error. */ - int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf) { const s32 token = rtas_function_token(RTAS_FN_IBM_GET_SYSTEM_PARAMETER); @@ -196,3 +201,152 @@ int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf) return ret; } + +static struct papr_sysparm_buf * +papr_sysparm_buf_from_user(const struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + long err; + u16 len; + + /* + * The length of valid data that userspace claims to be in + * user_iob->data[]. + */ + if (get_user(len, &user_iob->length)) + return ERR_PTR(-EFAULT); + + static_assert(sizeof(user_iob->data) >= PAPR_SYSPARM_MAX_INPUT); + static_assert(sizeof(kern_spbuf->val) >= PAPR_SYSPARM_MAX_INPUT); + + if (len > PAPR_SYSPARM_MAX_INPUT) + return ERR_PTR(-EINVAL); + + kern_spbuf = papr_sysparm_buf_alloc(); + if (!kern_spbuf) + return ERR_PTR(-ENOMEM); + + papr_sysparm_buf_set_length(kern_spbuf, len); + + if (len > 0 && copy_from_user(kern_spbuf->val, user_iob->data, len)) { + err = -EFAULT; + goto free_sysparm_buf; + } + + return kern_spbuf; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ERR_PTR(err); +} + +static int papr_sysparm_buf_to_user(const struct papr_sysparm_buf *kern_spbuf, + struct papr_sysparm_io_block __user *user_iob) +{ + u16 len_out = papr_sysparm_buf_get_length(kern_spbuf); + + if (put_user(len_out, &user_iob->length)) + return -EFAULT; + + static_assert(sizeof(user_iob->data) >= PAPR_SYSPARM_MAX_OUTPUT); + static_assert(sizeof(kern_spbuf->val) >= PAPR_SYSPARM_MAX_OUTPUT); + + if (copy_to_user(user_iob->data, kern_spbuf->val, PAPR_SYSPARM_MAX_OUTPUT)) + return -EFAULT; + + return 0; +} + +static long papr_sysparm_ioctl_get(struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + papr_sysparm_t param; + long ret; + + if (get_user(param.token, &user_iob->parameter)) + return -EFAULT; + + kern_spbuf = papr_sysparm_buf_from_user(user_iob); + if (IS_ERR(kern_spbuf)) + return PTR_ERR(kern_spbuf); + + ret = papr_sysparm_get(param, kern_spbuf); + if (ret) + goto free_sysparm_buf; + + ret = papr_sysparm_buf_to_user(kern_spbuf, user_iob); + if (ret) + goto free_sysparm_buf; + + ret = 0; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ret; +} + + +static long papr_sysparm_ioctl_set(struct papr_sysparm_io_block __user *user_iob) +{ + struct papr_sysparm_buf *kern_spbuf; + papr_sysparm_t param; + long ret; + + if (get_user(param.token, &user_iob->parameter)) + return -EFAULT; + + kern_spbuf = papr_sysparm_buf_from_user(user_iob); + if (IS_ERR(kern_spbuf)) + return PTR_ERR(kern_spbuf); + + ret = papr_sysparm_set(param, kern_spbuf); + if (ret) + goto free_sysparm_buf; + + ret = 0; + +free_sysparm_buf: + papr_sysparm_buf_free(kern_spbuf); + return ret; +} + +static long papr_sysparm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (__force void __user *)arg; + long ret; + + switch (ioctl) { + case PAPR_SYSPARM_IOC_GET: + ret = papr_sysparm_ioctl_get(argp); + break; + case PAPR_SYSPARM_IOC_SET: + if (filp->f_mode & FMODE_WRITE) + ret = papr_sysparm_ioctl_set(argp); + else + ret = -EBADF; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + return ret; +} + +static const struct file_operations papr_sysparm_ops = { + .unlocked_ioctl = papr_sysparm_ioctl, +}; + +static struct miscdevice papr_sysparm_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "papr-sysparm", + .fops = &papr_sysparm_ops, +}; + +static __init int papr_sysparm_init(void) +{ + if (!rtas_function_implemented(RTAS_FN_IBM_GET_SYSTEM_PARAMETER)) + return -ENODEV; + + return misc_register(&papr_sysparm_dev); +} +machine_device_initcall(pseries, papr_sysparm_init); From 9118c5d32bddb5f75bc4f9f31218e70317702502 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:01:59 -0600 Subject: [PATCH 243/310] powerpc/selftests: Add test for papr-vpd Add selftests for /dev/papr-vpd, exercising the common expected use cases: * Retrieve all VPD by passing an empty location code. * Retrieve the "system VPD" by passing a location code derived from DT root node properties, as done by the vpdupdate command. The tests also verify that certain intended properties of the driver hold: * Passing an unterminated location code to PAPR_VPD_CREATE_HANDLE gets EINVAL. * Passing a NULL location code pointer to PAPR_VPD_CREATE_HANDLE gets EFAULT. * Closing the device node without first issuing a PAPR_VPD_CREATE_HANDLE command to it succeeds. * Releasing a handle without first consuming any data from it succeeds. * Re-reading the contents of a handle returns the same data as the first time. Some minimal validation of the returned data is performed. The tests are skipped on systems where the papr-vpd driver does not initialize, making this useful only on PowerVM LPARs at this point. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-12-e9eafd0c8c6c@linux.ibm.com --- tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/papr_vpd/.gitignore | 1 + .../selftests/powerpc/papr_vpd/Makefile | 12 + .../selftests/powerpc/papr_vpd/papr_vpd.c | 352 ++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 tools/testing/selftests/powerpc/papr_vpd/.gitignore create mode 100644 tools/testing/selftests/powerpc/papr_vpd/Makefile create mode 100644 tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 7ea42fa02eab..05fc68d446c2 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -32,6 +32,7 @@ SUB_DIRS = alignment \ vphn \ math \ papr_attributes \ + papr_vpd \ ptrace \ security \ mce diff --git a/tools/testing/selftests/powerpc/papr_vpd/.gitignore b/tools/testing/selftests/powerpc/papr_vpd/.gitignore new file mode 100644 index 000000000000..49285031a656 --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_vpd/.gitignore @@ -0,0 +1 @@ +/papr_vpd diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile new file mode 100644 index 000000000000..06b719703bfd --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +noarg: + $(MAKE) -C ../ + +TEST_GEN_PROGS := papr_vpd + +top_srcdir = ../../../../.. +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c ../utils.c + +$(OUTPUT)/papr_vpd: CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c new file mode 100644 index 000000000000..98cbb9109ee6 --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0-only +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include + +#include "utils.h" + +#define DEVPATH "/dev/papr-vpd" + +static int dev_papr_vpd_open_close(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int dev_papr_vpd_get_handle_all(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + struct papr_location_code lc = { .str = "", }; + off_t size; + int fd; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(errno != 0); + FAIL_IF(fd < 0); + + FAIL_IF(close(devfd) != 0); + + size = lseek(fd, 0, SEEK_END); + FAIL_IF(size <= 0); + + void *buf = malloc((size_t)size); + FAIL_IF(!buf); + + ssize_t consumed = pread(fd, buf, size, 0); + FAIL_IF(consumed != size); + + /* Ensure EOF */ + FAIL_IF(read(fd, buf, size) != 0); + FAIL_IF(close(fd)); + + /* Verify that the buffer looks like VPD */ + static const char needle[] = "System VPD"; + FAIL_IF(!memmem(buf, size, needle, strlen(needle))); + + return 0; +} + +static int dev_papr_vpd_get_handle_byte_at_a_time(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + struct papr_location_code lc = { .str = "", }; + int fd; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(errno != 0); + FAIL_IF(fd < 0); + + FAIL_IF(close(devfd) != 0); + + size_t consumed = 0; + while (1) { + ssize_t res; + char c; + + errno = 0; + res = read(fd, &c, sizeof(c)); + FAIL_IF(res > sizeof(c)); + FAIL_IF(res < 0); + FAIL_IF(errno != 0); + consumed += res; + if (res == 0) + break; + } + + FAIL_IF(consumed != lseek(fd, 0, SEEK_END)); + + FAIL_IF(close(fd)); + + return 0; +} + + +static int dev_papr_vpd_unterm_loc_code(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + struct papr_location_code lc = {}; + int fd; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + /* + * Place a non-null byte in every element of loc_code; the + * driver should reject this input. + */ + memset(lc.str, 'x', ARRAY_SIZE(lc.str)); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(fd != -1); + FAIL_IF(errno != EINVAL); + + FAIL_IF(close(devfd) != 0); + return 0; +} + +static int dev_papr_vpd_null_handle(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + int rc; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + rc = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, NULL); + FAIL_IF(rc != -1); + FAIL_IF(errno != EFAULT); + + FAIL_IF(close(devfd) != 0); + return 0; +} + +static int papr_vpd_close_handle_without_reading(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + struct papr_location_code lc; + int fd; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(errno != 0); + FAIL_IF(fd < 0); + + /* close the handle without reading it */ + FAIL_IF(close(fd) != 0); + + FAIL_IF(close(devfd) != 0); + return 0; +} + +static int papr_vpd_reread(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + struct papr_location_code lc = { .str = "", }; + int fd; + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(errno != 0); + FAIL_IF(fd < 0); + + FAIL_IF(close(devfd) != 0); + + const off_t size = lseek(fd, 0, SEEK_END); + FAIL_IF(size <= 0); + + char *bufs[2]; + + for (size_t i = 0; i < ARRAY_SIZE(bufs); ++i) { + bufs[i] = malloc(size); + FAIL_IF(!bufs[i]); + ssize_t consumed = pread(fd, bufs[i], size, 0); + FAIL_IF(consumed != size); + } + + FAIL_IF(memcmp(bufs[0], bufs[1], size)); + + FAIL_IF(close(fd) != 0); + + return 0; +} + +static int get_system_loc_code(struct papr_location_code *lc) +{ + static const char system_id_path[] = "/sys/firmware/devicetree/base/system-id"; + static const char model_path[] = "/sys/firmware/devicetree/base/model"; + char *system_id; + char *model; + int err = -1; + + if (read_file_alloc(model_path, &model, NULL)) + return err; + + if (read_file_alloc(system_id_path, &system_id, NULL)) + goto free_model; + + char *mtm; + int sscanf_ret = sscanf(model, "IBM,%ms", &mtm); + if (sscanf_ret != 1) + goto free_system_id; + + char *plant_and_seq; + if (sscanf(system_id, "IBM,%*c%*c%ms", &plant_and_seq) != 1) + goto free_mtm; + /* + * Replace - with . to build location code. + */ + char *sep = strchr(mtm, '-'); + if (!sep) + goto free_mtm; + else + *sep = '.'; + + snprintf(lc->str, sizeof(lc->str), + "U%s.%s", mtm, plant_and_seq); + err = 0; + + free(plant_and_seq); +free_mtm: + free(mtm); +free_system_id: + free(system_id); +free_model: + free(model); + return err; +} + +static int papr_vpd_system_loc_code(void) +{ + struct papr_location_code lc; + const int devfd = open(DEVPATH, O_RDONLY); + off_t size; + int fd; + + SKIP_IF_MSG(get_system_loc_code(&lc), + "Cannot determine system location code"); + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + errno = 0; + fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, &lc); + FAIL_IF(errno != 0); + FAIL_IF(fd < 0); + + FAIL_IF(close(devfd) != 0); + + size = lseek(fd, 0, SEEK_END); + FAIL_IF(size <= 0); + + void *buf = malloc((size_t)size); + FAIL_IF(!buf); + + ssize_t consumed = pread(fd, buf, size, 0); + FAIL_IF(consumed != size); + + /* Ensure EOF */ + FAIL_IF(read(fd, buf, size) != 0); + FAIL_IF(close(fd)); + + /* Verify that the buffer looks like VPD */ + static const char needle[] = "System VPD"; + FAIL_IF(!memmem(buf, size, needle, strlen(needle))); + + return 0; +} + +struct vpd_test { + int (*function)(void); + const char *description; +}; + +static const struct vpd_test vpd_tests[] = { + { + .function = dev_papr_vpd_open_close, + .description = "open/close " DEVPATH, + }, + { + .function = dev_papr_vpd_unterm_loc_code, + .description = "ensure EINVAL on unterminated location code", + }, + { + .function = dev_papr_vpd_null_handle, + .description = "ensure EFAULT on bad handle addr", + }, + { + .function = dev_papr_vpd_get_handle_all, + .description = "get handle for all VPD" + }, + { + .function = papr_vpd_close_handle_without_reading, + .description = "close handle without consuming VPD" + }, + { + .function = dev_papr_vpd_get_handle_byte_at_a_time, + .description = "read all VPD one byte at a time" + }, + { + .function = papr_vpd_reread, + .description = "ensure re-read yields same results" + }, + { + .function = papr_vpd_system_loc_code, + .description = "get handle for system VPD" + }, +}; + +int main(void) +{ + size_t fails = 0; + + for (size_t i = 0; i < ARRAY_SIZE(vpd_tests); ++i) { + const struct vpd_test *t = &vpd_tests[i]; + + if (test_harness(t->function, t->description)) + ++fails; + } + + return fails == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} From 76b2ec3faeaa0c8d84705acd64ac0e5a307ce9c2 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 12 Dec 2023 11:02:00 -0600 Subject: [PATCH 244/310] powerpc/selftests: Add test for papr-sysparm Consistently testing system parameter access is a bit difficult by nature -- the set of parameters available depends on the model and system configuration, and updating a parameter should be considered a destructive operation reserved for the admin. So we validate some of the error paths and retrieve the SPLPAR characteristics string, but not much else. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212-papr-sys_rtas-vs-lockdown-v6-13-e9eafd0c8c6c@linux.ibm.com --- tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/papr_sysparm/.gitignore | 1 + .../selftests/powerpc/papr_sysparm/Makefile | 12 ++ .../powerpc/papr_sysparm/papr_sysparm.c | 196 ++++++++++++++++++ 4 files changed, 210 insertions(+) create mode 100644 tools/testing/selftests/powerpc/papr_sysparm/.gitignore create mode 100644 tools/testing/selftests/powerpc/papr_sysparm/Makefile create mode 100644 tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 05fc68d446c2..c376151982c4 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -33,6 +33,7 @@ SUB_DIRS = alignment \ math \ papr_attributes \ papr_vpd \ + papr_sysparm \ ptrace \ security \ mce diff --git a/tools/testing/selftests/powerpc/papr_sysparm/.gitignore b/tools/testing/selftests/powerpc/papr_sysparm/.gitignore new file mode 100644 index 000000000000..f2a69bf59d40 --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_sysparm/.gitignore @@ -0,0 +1 @@ +/papr_sysparm diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile new file mode 100644 index 000000000000..7f79e437634a --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +noarg: + $(MAKE) -C ../ + +TEST_GEN_PROGS := papr_sysparm + +top_srcdir = ../../../../.. +include ../../lib.mk + +$(TEST_GEN_PROGS): ../harness.c ../utils.c + +$(OUTPUT)/papr_sysparm: CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c new file mode 100644 index 000000000000..d5436de5b8ed --- /dev/null +++ b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#define DEVPATH "/dev/papr-sysparm" + +static int open_close(void) +{ + const int devfd = open(DEVPATH, O_RDONLY); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int get_splpar(void) +{ + struct papr_sysparm_io_block sp = { + .parameter = 20, // SPLPAR characteristics + }; + const int devfd = open(DEVPATH, O_RDONLY); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_GET, &sp) != 0); + FAIL_IF(sp.length == 0); + FAIL_IF(sp.length > sizeof(sp.data)); + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int get_bad_parameter(void) +{ + struct papr_sysparm_io_block sp = { + .parameter = UINT32_MAX, // there are only ~60 specified parameters + }; + const int devfd = open(DEVPATH, O_RDONLY); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + // Ensure expected error + FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_GET, &sp) != -1); + FAIL_IF(errno != EOPNOTSUPP); + + // Ensure the buffer is unchanged + FAIL_IF(sp.length != 0); + for (size_t i = 0; i < ARRAY_SIZE(sp.data); ++i) + FAIL_IF(sp.data[i] != 0); + + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int check_efault_common(unsigned long cmd) +{ + const int devfd = open(DEVPATH, O_RDWR); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + // Ensure expected error + FAIL_IF(ioctl(devfd, cmd, NULL) != -1); + FAIL_IF(errno != EFAULT); + + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int check_efault_get(void) +{ + return check_efault_common(PAPR_SYSPARM_IOC_GET); +} + +static int check_efault_set(void) +{ + return check_efault_common(PAPR_SYSPARM_IOC_SET); +} + +static int set_hmc0(void) +{ + struct papr_sysparm_io_block sp = { + .parameter = 0, // HMC0, not a settable parameter + }; + const int devfd = open(DEVPATH, O_RDWR); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + // Ensure expected error + FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_SET, &sp) != -1); + SKIP_IF_MSG(errno == EOPNOTSUPP, "operation not supported"); + FAIL_IF(errno != EPERM); + + FAIL_IF(close(devfd) != 0); + + return 0; +} + +static int set_with_ro_fd(void) +{ + struct papr_sysparm_io_block sp = { + .parameter = 0, // HMC0, not a settable parameter. + }; + const int devfd = open(DEVPATH, O_RDONLY); + + SKIP_IF_MSG(devfd < 0 && errno == ENOENT, + DEVPATH " not present"); + + FAIL_IF(devfd < 0); + + // Ensure expected error + FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_SET, &sp) != -1); + SKIP_IF_MSG(errno == EOPNOTSUPP, "operation not supported"); + + // HMC0 isn't a settable parameter and we would normally + // expect to get EPERM on attempts to modify it. However, when + // the file is open read-only, we expect the driver to prevent + // the attempt with a distinct error. + FAIL_IF(errno != EBADF); + + FAIL_IF(close(devfd) != 0); + + return 0; +} + +struct sysparm_test { + int (*function)(void); + const char *description; +}; + +static const struct sysparm_test sysparm_tests[] = { + { + .function = open_close, + .description = "open and close " DEVPATH " without issuing commands", + }, + { + .function = get_splpar, + .description = "retrieve SPLPAR characteristics", + }, + { + .function = get_bad_parameter, + .description = "verify EOPNOTSUPP for known-bad parameter", + }, + { + .function = check_efault_get, + .description = "PAPR_SYSPARM_IOC_GET returns EFAULT on bad address", + }, + { + .function = check_efault_set, + .description = "PAPR_SYSPARM_IOC_SET returns EFAULT on bad address", + }, + { + .function = set_hmc0, + .description = "ensure EPERM on attempt to update HMC0", + }, + { + .function = set_with_ro_fd, + .description = "PAPR_IOC_SYSPARM_SET returns EACCESS on read-only fd", + }, +}; + +int main(void) +{ + size_t fails = 0; + + for (size_t i = 0; i < ARRAY_SIZE(sysparm_tests); ++i) { + const struct sysparm_test *t = &sysparm_tests[i]; + + if (test_harness(t->function, t->description)) + ++fails; + } + + return fails == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} From 2ec36570c3581285d15de672eaed10ce7e9218cd Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 8 Dec 2023 22:00:40 +0530 Subject: [PATCH 245/310] powerpc/ftrace: Fix indentation in ftrace.h Replace seven spaces with a tab character to fix an indentation issue reported by the kernel test robot. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311221731.alUwTDIm-lkp@intel.com/ Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/9f058227bd9243f0842786ef7228d87ab10d29f6.1702045299.git.naveen@kernel.org --- arch/powerpc/include/asm/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 9e5a39b6a311..1ebd2ca97f12 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -25,7 +25,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) if (IS_ENABLED(CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY)) addr += MCOUNT_INSN_SIZE; - return addr; + return addr; } unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, From ae24db43b3b427eb290b58d55179c32f0a7539d1 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Fri, 8 Dec 2023 22:00:42 +0530 Subject: [PATCH 246/310] powerpc/ftrace: Remove nops after the call to ftrace_stub ftrace_stub is within the same CU, so there is no need for a subsequent nop instruction. Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/8ee5ec520e37d5523654bb2cd65a17512fb774e2.1702045299.git.naveen@kernel.org --- arch/powerpc/kernel/trace/ftrace_entry.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_entry.S b/arch/powerpc/kernel/trace/ftrace_entry.S index 90701885762c..a517a4085cff 100644 --- a/arch/powerpc/kernel/trace/ftrace_entry.S +++ b/arch/powerpc/kernel/trace/ftrace_entry.S @@ -162,7 +162,6 @@ _GLOBAL(ftrace_regs_caller) .globl ftrace_regs_call ftrace_regs_call: bl ftrace_stub - nop ftrace_regs_exit 1 _GLOBAL(ftrace_caller) @@ -171,7 +170,6 @@ _GLOBAL(ftrace_caller) .globl ftrace_call ftrace_call: bl ftrace_stub - nop ftrace_regs_exit 0 _GLOBAL(ftrace_stub) From b20f98e8b3deb50247603f0242ee2d1e38726635 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy Date: Fri, 8 Dec 2023 22:00:43 +0530 Subject: [PATCH 247/310] powerpc/Kconfig: Select FUNCTION_ALIGNMENT_4B Commit d49a0626216b95 ("arch: Introduce CONFIG_FUNCTION_ALIGNMENT") introduced a generic function-alignment infrastructure. Move to using FUNCTION_ALIGNMENT_4B on powerpc, to use the same alignment as that of the existing _GLOBAL macro. Signed-off-by: Sathvika Vasireddy Signed-off-by: Michael Ellerman Link: https://msgid.link/21892186ec44abe24df0daf64f577dac0e78783f.1702045299.git.naveen@kernel.org --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/linkage.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6f105ee4f3cf..318e5c1b7454 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -189,6 +189,7 @@ config PPC select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if ARCH_USING_PATCHABLE_FUNCTION_ENTRY + select FUNCTION_ALIGNMENT_4B select GENERIC_ATOMIC64 if PPC32 select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CMOS_UPDATE diff --git a/arch/powerpc/include/asm/linkage.h b/arch/powerpc/include/asm/linkage.h index b88d1d2cf304..b71b9582e754 100644 --- a/arch/powerpc/include/asm/linkage.h +++ b/arch/powerpc/include/asm/linkage.h @@ -4,9 +4,6 @@ #include -#define __ALIGN .align 2 -#define __ALIGN_STR ".align 2" - #ifdef CONFIG_PPC64_ELF_ABI_V1 #define cond_syscall(x) \ asm ("\t.weak " #x "\n\t.set " #x ", sys_ni_syscall\n" \ From f46c8a75263f97bda13c739ba1c90aced0d3b071 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 4 Dec 2023 10:32:23 +0800 Subject: [PATCH 248/310] powerpc/mm: Fix null-pointer dereference in pgtable_cache_add kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. Suggested-by: Christophe Leroy Suggested-by: Michael Ellerman Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231204023223.2447523-1-chentao@kylinos.cn --- arch/powerpc/mm/init-common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 119ef491f797..d3a7726ecf51 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -126,7 +126,7 @@ void pgtable_cache_add(unsigned int shift) * as to leave enough 0 bits in the address to contain it. */ unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, HUGEPD_SHIFT_MASK + 1); - struct kmem_cache *new; + struct kmem_cache *new = NULL; /* It would be nice if this was a BUILD_BUG_ON(), but at the * moment, gcc doesn't seem to recognize is_power_of_2 as a @@ -139,7 +139,8 @@ void pgtable_cache_add(unsigned int shift) align = max_t(unsigned long, align, minalign); name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); - new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); + if (name) + new = kmem_cache_create(name, table_size, align, 0, ctor(shift)); if (!new) panic("Could not allocate pgtable cache for order %d", shift); From 9a260f2dd827bbc82cc60eb4f4d8c22707d80742 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 8 Dec 2023 16:59:37 +0800 Subject: [PATCH 249/310] powerpc/powernv: Add a null pointer check to scom_debug_init_one() kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Add a null pointer check, and release 'ent' to avoid memory leaks. Fixes: bfd2f0d49aef ("powerpc/powernv: Get rid of old scom_controller abstraction") Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231208085937.107210-1-chentao@kylinos.cn --- arch/powerpc/platforms/powernv/opal-xscom.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 262cd6fac907..748c2b97fa53 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -165,6 +165,11 @@ static int scom_debug_init_one(struct dentry *root, struct device_node *dn, ent->chip = chip; snprintf(ent->name, 16, "%08x", chip); ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn); + if (!ent->path.data) { + kfree(ent); + return -ENOMEM; + } + ent->path.size = strlen((char *)ent->path.data); dir = debugfs_create_dir(ent->name, root); From 8649829a1dd25199bbf557b2621cedb4bf9b3050 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 27 Nov 2023 11:07:55 +0800 Subject: [PATCH 250/310] powerpc/powernv: Add a null pointer check in opal_event_init() kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: 2717a33d6074 ("powerpc/opal-irqchip: Use interrupt names if present") Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231127030755.1546750-1-chentao@kylinos.cn --- arch/powerpc/platforms/powernv/opal-irqchip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index f9a7001dacb7..56a1f7ce78d2 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -275,6 +275,8 @@ int __init opal_event_init(void) else name = kasprintf(GFP_KERNEL, "opal"); + if (!name) + continue; /* Install interrupt handler */ rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK, name, NULL); From e123015c0ba859cf48aa7f89c5016cc6e98e018d Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Sun, 26 Nov 2023 17:57:39 +0800 Subject: [PATCH 251/310] powerpc/powernv: Add a null pointer check in opal_powercap_init() kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: b9ef7b4b867f ("powerpc: Convert to using %pOFn instead of device_node.name") Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231126095739.1501990-1-chentao@kylinos.cn --- arch/powerpc/platforms/powernv/opal-powercap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c index 7bfe4cbeb35a..ea917266aa17 100644 --- a/arch/powerpc/platforms/powernv/opal-powercap.c +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -196,6 +196,12 @@ void __init opal_powercap_init(void) j = 0; pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node); + if (!pcaps[i].pg.name) { + kfree(pcaps[i].pattrs); + kfree(pcaps[i].pg.attrs); + goto out_pcaps_pattrs; + } + if (has_min) { powercap_add_attr(min, "powercap-min", &pcaps[i].pattrs[j]); From 0a233867a39078ebb0f575e2948593bbff5826b3 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Sun, 26 Nov 2023 17:37:19 +0800 Subject: [PATCH 252/310] powerpc/imc-pmu: Add a null pointer check in update_events_in_group() kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Fixes: 885dcd709ba9 ("powerpc/perf: Add nest IMC PMU support") Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20231126093719.1440305-1-chentao@kylinos.cn --- arch/powerpc/perf/imc-pmu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 5d12ca386c1f..8664a7d297ad 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -299,6 +299,8 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) attr_group->attrs = attrs; do { ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i].value); + if (!ev_val_str) + continue; dev_str = device_str_attr_create(pmu->events[i].name, ev_val_str); if (!dev_str) continue; @@ -306,6 +308,8 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) attrs[j++] = dev_str; if (pmu->events[i].scale) { ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale", pmu->events[i].name); + if (!ev_scale_str) + continue; dev_str = device_str_attr_create(ev_scale_str, pmu->events[i].scale); if (!dev_str) continue; @@ -315,6 +319,8 @@ static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) if (pmu->events[i].unit) { ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit", pmu->events[i].name); + if (!ev_unit_str) + continue; dev_str = device_str_attr_create(ev_unit_str, pmu->events[i].unit); if (!dev_str) continue; From a143892cb77c5397fd4356bbef9982abe4f3c5a5 Mon Sep 17 00:00:00 2001 From: Aditya Gupta Date: Wed, 20 Sep 2023 16:27:06 +0530 Subject: [PATCH 253/310] powerpc: add cpu_spec.cpu_features to vmcoreinfo CPU features can be determined in makedumpfile, using 'cur_cpu_spec.cpu_features'. This provides more data to makedumpfile about the crashed system, and can help in filtering the vmcore accordingly. Signed-off-by: Aditya Gupta Signed-off-by: Michael Ellerman Link: https://msgid.link/20230920105706.853626-2-adityag@linux.ibm.com --- arch/powerpc/kexec/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 85846cadb9b5..27fa9098a5b7 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -75,6 +75,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_OFFSET(mmu_psize_def, shift); #endif VMCOREINFO_SYMBOL(cur_cpu_spec); + VMCOREINFO_OFFSET(cpu_spec, cpu_features); VMCOREINFO_OFFSET(cpu_spec, mmu_features); vmcoreinfo_append_str("NUMBER(RADIX_MMU)=%d\n", early_radix_enabled()); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); From 7b1a09e44dc64f4f5930659b6d14a27183c00705 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 13 Dec 2023 09:20:46 +0800 Subject: [PATCH 254/310] arm64: irq: set the correct node for shadow call stack The init_irq_stacks() has been changed to use the correct node: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git/commit/?id=75b5e0bf90bf The init_irq_scs() has the same issue with init_irq_stacks(): cpu_to_node() is not initialized yet, it does not work. This patch uses early_cpu_to_node() to set the init_irq_scs() with the correct node. Signed-off-by: Huang Shijie Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20231213012046.12014-1-shijie@os.amperecomputing.com Signed-off-by: Will Deacon --- arch/arm64/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f253d8efe90..85087e2df564 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -48,7 +48,7 @@ static void init_irq_scs(void) for_each_possible_cpu(cpu) per_cpu(irq_shadow_call_stack_ptr, cpu) = - scs_alloc(cpu_to_node(cpu)); + scs_alloc(early_cpu_to_node(cpu)); } #ifdef CONFIG_VMAP_STACK From 7d28365a06af74cee015a448d32ab6e98cd05cfb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 9 Nov 2023 21:09:56 +0100 Subject: [PATCH 255/310] x86/head_64: Use TESTB instead of TESTL in secondary_startup_64_no_verify() There is no need to use TESTL when checking the least-significant bit with a TEST instruction. Use TESTB, which is three bytes shorter: f6 05 00 00 00 00 01 testb $0x1,0x0(%rip) vs: f7 05 00 00 00 00 01 testl $0x1,0x0(%rip) 00 00 00 for the same effect. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231109201032.4439-1-ubizjak@gmail.com --- arch/x86/kernel/head_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 086a2c3aaaa0..1f79d809305d 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -182,7 +182,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Enable PAE mode, PSE, PGE and LA57 */ orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL - testl $1, __pgtable_l5_enabled(%rip) + testb $1, __pgtable_l5_enabled(%rip) jz 1f orl $X86_CR4_LA57, %ecx 1: From cae40614cdd61a6601dc87c6e07c06bf642a125b Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:48 +0800 Subject: [PATCH 256/310] docs: perf: Add description for Synopsys DesignWare PCIe PMU driver Alibaba's T-Head Yitan 710 SoC includes Synopsys' DesignWare Core PCIe controller which implements PMU for performance and functional debugging to facilitate system maintenance. Document it to provide guidance on how to use it. Signed-off-by: Shuai Xue Reviewed-by: Baolin Wang Reviewed-by: Jonathan Cameron Reviewed-by: Yicong Yang Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-2-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- .../admin-guide/perf/dwc_pcie_pmu.rst | 94 +++++++++++++++++++ Documentation/admin-guide/perf/index.rst | 1 + 2 files changed, 95 insertions(+) create mode 100644 Documentation/admin-guide/perf/dwc_pcie_pmu.rst diff --git a/Documentation/admin-guide/perf/dwc_pcie_pmu.rst b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst new file mode 100644 index 000000000000..d47cd229d710 --- /dev/null +++ b/Documentation/admin-guide/perf/dwc_pcie_pmu.rst @@ -0,0 +1,94 @@ +====================================================================== +Synopsys DesignWare Cores (DWC) PCIe Performance Monitoring Unit (PMU) +====================================================================== + +DesignWare Cores (DWC) PCIe PMU +=============================== + +The PMU is a PCIe configuration space register block provided by each PCIe Root +Port in a Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error +injection, and Statistics). + +As the name indicates, the RAS DES capability supports system level +debugging, AER error injection, and collection of statistics. To facilitate +collection of statistics, Synopsys DesignWare Cores PCIe controller +provides the following two features: + +- one 64-bit counter for Time Based Analysis (RX/TX data throughput and + time spent in each low-power LTSSM state) and +- one 32-bit counter for Event Counting (error and non-error events for + a specified lane) + +Note: There is no interrupt for counter overflow. + +Time Based Analysis +------------------- + +Using this feature you can obtain information regarding RX/TX data +throughput and time spent in each low-power LTSSM state by the controller. +The PMU measures data in two categories: + +- Group#0: Percentage of time the controller stays in LTSSM states. +- Group#1: Amount of data processed (Units of 16 bytes). + +Lane Event counters +------------------- + +Using this feature you can obtain Error and Non-Error information in +specific lane by the controller. The PMU event is selected by all of: + +- Group i +- Event j within the Group i +- Lane k + +Some of the events only exist for specific configurations. + +DesignWare Cores (DWC) PCIe PMU Driver +======================================= + +This driver adds PMU devices for each PCIe Root Port named based on the BDF of +the Root Port. For example, + + 30:03.0 PCI bridge: Device 1ded:8000 (rev 01) + +the PMU device name for this Root Port is dwc_rootport_3018. + +The DWC PCIe PMU driver registers a perf PMU driver, which provides +description of available events and configuration options in sysfs, see +/sys/bus/event_source/devices/dwc_rootport_{bdf}. + +The "format" directory describes format of the config fields of the +perf_event_attr structure. The "events" directory provides configuration +templates for all documented events. For example, +"Rx_PCIe_TLP_Data_Payload" is an equivalent of "eventid=0x22,type=0x1". + +The "perf list" command shall list the available events from sysfs, e.g.:: + + $# perf list | grep dwc_rootport + <...> + dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/ [Kernel PMU event] + <...> + dwc_rootport_3018/rx_memory_read,lane=?/ [Kernel PMU event] + +Time Based Analysis Event Usage +------------------------------- + +Example usage of counting PCIe RX TLP data payload (Units of bytes):: + + $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/ + +The average RX/TX bandwidth can be calculated using the following formula: + + PCIe RX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window + PCIe TX Bandwidth = Tx_PCIe_TLP_Data_Payload / Measure_Time_Window + +Lane Event Usage +------------------------------- + +Each lane has the same event set and to avoid generating a list of hundreds +of events, the user need to specify the lane ID explicitly, e.g.:: + + $# perf stat -a -e dwc_rootport_3018/rx_memory_read,lane=4/ + +The driver does not support sampling, therefore "perf record" will not +work. Per-task (without "-a") perf sessions are not supported. diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index a2e6f2c81146..f4a4513c526f 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -19,6 +19,7 @@ Performance monitor support arm_dsu_pmu thunderx2-pmu alibaba_pmu + dwc_pcie_pmu nvidia-pmu meson-ddr-pmu cxl From ad6534c626fedd818718d76c36d69c7d8e7b61cc Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:49 +0800 Subject: [PATCH 257/310] PCI: Add Alibaba Vendor ID to linux/pci_ids.h The Alibaba Vendor ID (0x1ded) is now used by Alibaba elasticRDMA ("erdma") and will be shared with the upcoming PCIe PMU ("dwc_pcie_pmu"). Move the Vendor ID to linux/pci_ids.h so that it can shared by several drivers later. Signed-off-by: Shuai Xue Acked-by: Bjorn Helgaas # pci_ids.h Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-3-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- drivers/infiniband/hw/erdma/erdma_hw.h | 2 -- include/linux/pci_ids.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 9d316fdc6f9a..a155519a862f 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -11,8 +11,6 @@ #include /* PCIe device related definition. */ -#define PCI_VENDOR_ID_ALIBABA 0x1ded - #define ERDMA_PCI_WIDTH 64 #define ERDMA_FUNC_BAR 0 #define ERDMA_MISX_BAR 2 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 275799b5f535..844ffdac8d7d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2605,6 +2605,8 @@ #define PCI_VENDOR_ID_TEKRAM 0x1de1 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29 +#define PCI_VENDOR_ID_ALIBABA 0x1ded + #define PCI_VENDOR_ID_TEHUTI 0x1fc9 #define PCI_DEVICE_ID_TEHUTI_3009 0x3009 #define PCI_DEVICE_ID_TEHUTI_3010 0x3010 From ac16087134b837d42b75bb1c741070b6c142f258 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:50 +0800 Subject: [PATCH 258/310] PCI: Move pci_clear_and_set_dword() helper to PCI header The clear and set pattern is commonly used for accessing PCI config, move the helper pci_clear_and_set_dword() from aspm.c into PCI header. In addition, rename to pci_clear_and_set_config_dword() to retain the "config" information and match the other accessors. No functional change intended. Signed-off-by: Shuai Xue Acked-by: Bjorn Helgaas Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-4-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- drivers/pci/access.c | 12 ++++++++ drivers/pci/pcie/aspm.c | 65 +++++++++++++++++++---------------------- include/linux/pci.h | 2 ++ 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 6554a2e89d36..6449056b57dd 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -598,3 +598,15 @@ int pci_write_config_dword(const struct pci_dev *dev, int where, return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_write_config_dword); + +void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos, + u32 clear, u32 set) +{ + u32 val; + + pci_read_config_dword(dev, pos, &val); + val &= ~clear; + val |= set; + pci_write_config_dword(dev, pos, val); +} +EXPORT_SYMBOL(pci_clear_and_set_config_dword); diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 50b04ae5c394..a720a8e3ced2 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -426,17 +426,6 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint) } } -static void pci_clear_and_set_dword(struct pci_dev *pdev, int pos, - u32 clear, u32 set) -{ - u32 val; - - pci_read_config_dword(pdev, pos, &val); - val &= ~clear; - val |= set; - pci_write_config_dword(pdev, pos, val); -} - /* Calculate L1.2 PM substate timing parameters */ static void aspm_calc_l12_info(struct pcie_link_state *link, u32 parent_l1ss_cap, u32 child_l1ss_cap) @@ -501,10 +490,12 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; if (pl1_2_enables || cl1_2_enables) { - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1_2_MASK, 0); - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_config_dword(child, + child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_config_dword(parent, + parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); } /* Program T_POWER_ON times in both ports */ @@ -512,22 +503,26 @@ static void aspm_calc_l12_info(struct pcie_link_state *link, pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); /* Program Common_Mode_Restore_Time in upstream device */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); /* Program LTR_L1.2_THRESHOLD time in both ports */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, + ctl1); + pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, + ctl1); if (pl1_2_enables || cl1_2_enables) { - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, - pl1_2_enables); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, - cl1_2_enables); + pci_clear_and_set_config_dword(parent, + parent->l1ss + PCI_L1SS_CTL1, 0, + pl1_2_enables); + pci_clear_and_set_config_dword(child, + child->l1ss + PCI_L1SS_CTL1, 0, + cl1_2_enables); } } @@ -687,10 +682,10 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) */ /* Disable all L1 substates */ - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1SS_MASK, 0); - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1SS_MASK, 0); + pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1SS_MASK, 0); + pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1SS_MASK, 0); /* * If needed, disable L1, and it gets enabled later * in pcie_config_aspm_link(). @@ -713,10 +708,10 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) val |= PCI_L1SS_CTL1_PCIPM_L1_2; /* Enable what we need to enable */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1SS_MASK, val); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_L1SS_MASK, val); + pci_clear_and_set_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1SS_MASK, val); + pci_clear_and_set_config_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1SS_MASK, val); } static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val) diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc867..268c4bd98ef3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,6 +1239,8 @@ int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val); int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val); int pci_write_config_word(const struct pci_dev *dev, int where, u16 val); int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val); +void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos, + u32 clear, u32 set); int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val); From af9597adc2f1e3609c67c9792a2469bb64e43ae9 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:51 +0800 Subject: [PATCH 259/310] drivers/perf: add DesignWare PCIe PMU driver This commit adds the PCIe Performance Monitoring Unit (PMU) driver support for T-Head Yitian SoC chip. Yitian is based on the Synopsys PCI Express Core controller IP which provides statistics feature. The PMU is a PCIe configuration space register block provided by each PCIe Root Port in a Vendor-Specific Extended Capability named RAS D.E.S (Debug, Error injection, and Statistics). To facilitate collection of statistics the controller provides the following two features for each Root Port: - one 64-bit counter for Time Based Analysis (RX/TX data throughput and time spent in each low-power LTSSM state) and - one 32-bit counter for Event Counting (error and non-error events for a specified lane) Note: There is no interrupt for counter overflow. This driver adds PMU devices for each PCIe Root Port. And the PMU device is named based the BDF of Root Port. For example, 30:03.0 PCI bridge: Device 1ded:8000 (rev 01) the PMU device name for this Root Port is dwc_rootport_3018. Example usage of counting PCIe RX TLP data payload (Units of bytes):: $# perf stat -a -e dwc_rootport_3018/Rx_PCIe_TLP_Data_Payload/ average RX bandwidth can be calculated like this: PCIe TX Bandwidth = Rx_PCIe_TLP_Data_Payload / Measure_Time_Window Signed-off-by: Shuai Xue Reviewed-by: Baolin Wang Reviewed-by: Jonathan Cameron Reviewed-by: Yicong Yang Reviewed-and-tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-5-xueshuai@linux.alibaba.com [will: Fix sparse error due to use of uninitialised 'vsec' symbol in dwc_pcie_match_des_cap()] Signed-off-by: Will Deacon --- drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/dwc_pcie_pmu.c | 792 ++++++++++++++++++++++++++++++++++++ 3 files changed, 800 insertions(+) create mode 100644 drivers/perf/dwc_pcie_pmu.c diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 273d67ecf6d2..ec6e0d9194a1 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -217,6 +217,13 @@ config MARVELL_CN10K_DDR_PMU Enable perf support for Marvell DDR Performance monitoring event on CN10K platform. +config DWC_PCIE_PMU + tristate "Synopsys DesignWare PCIe PMU" + depends on PCI + help + Enable perf support for Synopsys DesignWare PCIe PMU Performance + monitoring event on platform including the Alibaba Yitian 710. + source "drivers/perf/arm_cspmu/Kconfig" source "drivers/perf/amlogic/Kconfig" diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 16b3ec4db916..a06338e3401c 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o +obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/ obj-$(CONFIG_MESON_DDR_PMU) += amlogic/ obj-$(CONFIG_CXL_PMU) += cxl_pmu.o diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c new file mode 100644 index 000000000000..957058ad0099 --- /dev/null +++ b/drivers/perf/dwc_pcie_pmu.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Synopsys DesignWare PCIe PMU driver + * + * Copyright (C) 2021-2023 Alibaba Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DWC_PCIE_VSEC_RAS_DES_ID 0x02 +#define DWC_PCIE_EVENT_CNT_CTL 0x8 + +/* + * Event Counter Data Select includes two parts: + * - 27-24: Group number(4-bit: 0..0x7) + * - 23-16: Event number(8-bit: 0..0x13) within the Group + * + * Put them together as in TRM. + */ +#define DWC_PCIE_CNT_EVENT_SEL GENMASK(27, 16) +#define DWC_PCIE_CNT_LANE_SEL GENMASK(11, 8) +#define DWC_PCIE_CNT_STATUS BIT(7) +#define DWC_PCIE_CNT_ENABLE GENMASK(4, 2) +#define DWC_PCIE_PER_EVENT_OFF 0x1 +#define DWC_PCIE_PER_EVENT_ON 0x3 +#define DWC_PCIE_EVENT_CLEAR GENMASK(1, 0) +#define DWC_PCIE_EVENT_PER_CLEAR 0x1 + +#define DWC_PCIE_EVENT_CNT_DATA 0xC + +#define DWC_PCIE_TIME_BASED_ANAL_CTL 0x10 +#define DWC_PCIE_TIME_BASED_REPORT_SEL GENMASK(31, 24) +#define DWC_PCIE_TIME_BASED_DURATION_SEL GENMASK(15, 8) +#define DWC_PCIE_DURATION_MANUAL_CTL 0x0 +#define DWC_PCIE_DURATION_1MS 0x1 +#define DWC_PCIE_DURATION_10MS 0x2 +#define DWC_PCIE_DURATION_100MS 0x3 +#define DWC_PCIE_DURATION_1S 0x4 +#define DWC_PCIE_DURATION_2S 0x5 +#define DWC_PCIE_DURATION_4S 0x6 +#define DWC_PCIE_DURATION_4US 0xFF +#define DWC_PCIE_TIME_BASED_TIMER_START BIT(0) +#define DWC_PCIE_TIME_BASED_CNT_ENABLE 0x1 + +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW 0x14 +#define DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH 0x18 + +/* Event attributes */ +#define DWC_PCIE_CONFIG_EVENTID GENMASK(15, 0) +#define DWC_PCIE_CONFIG_TYPE GENMASK(19, 16) +#define DWC_PCIE_CONFIG_LANE GENMASK(27, 20) + +#define DWC_PCIE_EVENT_ID(event) FIELD_GET(DWC_PCIE_CONFIG_EVENTID, (event)->attr.config) +#define DWC_PCIE_EVENT_TYPE(event) FIELD_GET(DWC_PCIE_CONFIG_TYPE, (event)->attr.config) +#define DWC_PCIE_EVENT_LANE(event) FIELD_GET(DWC_PCIE_CONFIG_LANE, (event)->attr.config) + +enum dwc_pcie_event_type { + DWC_PCIE_TIME_BASE_EVENT, + DWC_PCIE_LANE_EVENT, + DWC_PCIE_EVENT_TYPE_MAX, +}; + +#define DWC_PCIE_LANE_EVENT_MAX_PERIOD GENMASK_ULL(31, 0) +#define DWC_PCIE_MAX_PERIOD GENMASK_ULL(63, 0) + +struct dwc_pcie_pmu { + struct pmu pmu; + struct pci_dev *pdev; /* Root Port device */ + u16 ras_des_offset; + u32 nr_lanes; + + struct list_head pmu_node; + struct hlist_node cpuhp_node; + struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; + int on_cpu; +}; + +#define to_dwc_pcie_pmu(p) (container_of(p, struct dwc_pcie_pmu, pmu)) + +static int dwc_pcie_pmu_hp_state; +static struct list_head dwc_pcie_dev_info_head = + LIST_HEAD_INIT(dwc_pcie_dev_info_head); +static bool notify; + +struct dwc_pcie_dev_info { + struct platform_device *plat_dev; + struct pci_dev *pdev; + struct list_head dev_node; +}; + +struct dwc_pcie_vendor_id { + int vendor_id; +}; + +static const struct dwc_pcie_vendor_id dwc_pcie_vendor_ids[] = { + {.vendor_id = PCI_VENDOR_ID_ALIBABA }, + {} /* terminator */ +}; + +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *dwc_pcie_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static struct attribute_group dwc_pcie_cpumask_attr_group = { + .attrs = dwc_pcie_pmu_cpumask_attrs, +}; + +struct dwc_pcie_format_attr { + struct device_attribute attr; + u64 field; + int config; +}; + +PMU_FORMAT_ATTR(eventid, "config:0-15"); +PMU_FORMAT_ATTR(type, "config:16-19"); +PMU_FORMAT_ATTR(lane, "config:20-27"); + +static struct attribute *dwc_pcie_format_attrs[] = { + &format_attr_type.attr, + &format_attr_eventid.attr, + &format_attr_lane.attr, + NULL, +}; + +static struct attribute_group dwc_pcie_format_attrs_group = { + .name = "format", + .attrs = dwc_pcie_format_attrs, +}; + +struct dwc_pcie_event_attr { + struct device_attribute attr; + enum dwc_pcie_event_type type; + u16 eventid; + u8 lane; +}; + +static ssize_t dwc_pcie_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dwc_pcie_event_attr *eattr; + + eattr = container_of(attr, typeof(*eattr), attr); + + if (eattr->type == DWC_PCIE_LANE_EVENT) + return sysfs_emit(buf, "eventid=0x%x,type=0x%x,lane=?\n", + eattr->eventid, eattr->type); + else if (eattr->type == DWC_PCIE_TIME_BASE_EVENT) + return sysfs_emit(buf, "eventid=0x%x,type=0x%x\n", + eattr->eventid, eattr->type); + + return 0; +} + +#define DWC_PCIE_EVENT_ATTR(_name, _type, _eventid, _lane) \ + (&((struct dwc_pcie_event_attr[]) {{ \ + .attr = __ATTR(_name, 0444, dwc_pcie_event_show, NULL), \ + .type = _type, \ + .eventid = _eventid, \ + .lane = _lane, \ + }})[0].attr.attr) + +#define DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(_name, _eventid) \ + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_TIME_BASE_EVENT, _eventid, 0) +#define DWC_PCIE_PMU_LANE_EVENT_ATTR(_name, _eventid) \ + DWC_PCIE_EVENT_ATTR(_name, DWC_PCIE_LANE_EVENT, _eventid, 0) + +static struct attribute *dwc_pcie_pmu_time_event_attrs[] = { + /* Group #0 */ + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(one_cycle, 0x00), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_L0S, 0x01), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(RX_L0S, 0x02), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L0, 0x03), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1, 0x04), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_1, 0x05), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_2, 0x06), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(CFG_RCVRY, 0x07), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(TX_RX_L0S, 0x08), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09), + + /* Group #1 */ + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23), + + /* + * Leave it to the user to specify the lane ID to avoid generating + * a list of hundreds of events. + */ + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ack_dllp, 0x600), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_read, 0x703), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_write, 0x704), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_io_read, 0x705), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_without_data, 0x706), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_completion_with_data, 0x707), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_message_tlp, 0x708), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_atomic, 0x709), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_tlp_with_prefix, 0x70A), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_write, 0x70B), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_memory_read, 0x70C), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_write, 0x70F), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_io_read, 0x710), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_without_data, 0x711), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_completion_with_data, 0x712), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_message_tlp, 0x713), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_atomic, 0x714), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_tlp_with_prefix, 0x715), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_ccix_tlp, 0x716), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ccix_tlp, 0x717), + NULL +}; + +static const struct attribute_group dwc_pcie_event_attrs_group = { + .name = "events", + .attrs = dwc_pcie_pmu_time_event_attrs, +}; + +static const struct attribute_group *dwc_pcie_attr_groups[] = { + &dwc_pcie_event_attrs_group, + &dwc_pcie_format_attrs_group, + &dwc_pcie_cpumask_attr_group, + NULL +}; + +static void dwc_pcie_pmu_lane_event_enable(struct dwc_pcie_pmu *pcie_pmu, + bool enable) +{ + struct pci_dev *pdev = pcie_pmu->pdev; + u16 ras_des_offset = pcie_pmu->ras_des_offset; + + if (enable) + pci_clear_and_set_config_dword(pdev, + ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_ON); + else + pci_clear_and_set_config_dword(pdev, + ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF); +} + +static void dwc_pcie_pmu_time_based_event_enable(struct dwc_pcie_pmu *pcie_pmu, + bool enable) +{ + struct pci_dev *pdev = pcie_pmu->pdev; + u16 ras_des_offset = pcie_pmu->ras_des_offset; + + pci_clear_and_set_config_dword(pdev, + ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, + DWC_PCIE_TIME_BASED_TIMER_START, enable); +} + +static u64 dwc_pcie_pmu_read_lane_event_counter(struct perf_event *event) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + struct pci_dev *pdev = pcie_pmu->pdev; + u16 ras_des_offset = pcie_pmu->ras_des_offset; + u32 val; + + pci_read_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_DATA, &val); + + return val; +} + +static u64 dwc_pcie_pmu_read_time_based_counter(struct perf_event *event) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + struct pci_dev *pdev = pcie_pmu->pdev; + int event_id = DWC_PCIE_EVENT_ID(event); + u16 ras_des_offset = pcie_pmu->ras_des_offset; + u32 lo, hi, ss; + u64 val; + + /* + * The 64-bit value of the data counter is spread across two + * registers that are not synchronized. In order to read them + * atomically, ensure that the high 32 bits match before and after + * reading the low 32 bits. + */ + pci_read_config_dword(pdev, + ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, &hi); + do { + /* snapshot the high 32 bits */ + ss = hi; + + pci_read_config_dword( + pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_LOW, + &lo); + pci_read_config_dword( + pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_DATA_REG_HIGH, + &hi); + } while (hi != ss); + + val = ((u64)hi << 32) | lo; + /* + * The Group#1 event measures the amount of data processed in 16-byte + * units. Simplify the end-user interface by multiplying the counter + * at the point of read. + */ + if (event_id >= 0x20 && event_id <= 0x23) + val *= 16; + + return val; +} + +static void dwc_pcie_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + u64 delta, prev, now = 0; + + do { + prev = local64_read(&hwc->prev_count); + + if (type == DWC_PCIE_LANE_EVENT) + now = dwc_pcie_pmu_read_lane_event_counter(event); + else if (type == DWC_PCIE_TIME_BASE_EVENT) + now = dwc_pcie_pmu_read_time_based_counter(event); + + } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev); + + delta = (now - prev) & DWC_PCIE_MAX_PERIOD; + /* 32-bit counter for Lane Event Counting */ + if (type == DWC_PCIE_LANE_EVENT) + delta &= DWC_PCIE_LANE_EVENT_MAX_PERIOD; + + local64_add(delta, &event->count); +} + +static int dwc_pcie_pmu_event_init(struct perf_event *event) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + struct perf_event *sibling; + u32 lane; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* We don't support sampling */ + if (is_sampling_event(event)) + return -EINVAL; + + /* We cannot support task bound events */ + if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + if (event->group_leader != event && + !is_software_event(event->group_leader)) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling->pmu != event->pmu && !is_software_event(sibling)) + return -EINVAL; + } + + if (type < 0 || type >= DWC_PCIE_EVENT_TYPE_MAX) + return -EINVAL; + + if (type == DWC_PCIE_LANE_EVENT) { + lane = DWC_PCIE_EVENT_LANE(event); + if (lane < 0 || lane >= pcie_pmu->nr_lanes) + return -EINVAL; + } + + event->cpu = pcie_pmu->on_cpu; + + return 0; +} + +static void dwc_pcie_pmu_event_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + + hwc->state = 0; + local64_set(&hwc->prev_count, 0); + + if (type == DWC_PCIE_LANE_EVENT) + dwc_pcie_pmu_lane_event_enable(pcie_pmu, true); + else if (type == DWC_PCIE_TIME_BASE_EVENT) + dwc_pcie_pmu_time_based_event_enable(pcie_pmu, true); +} + +static void dwc_pcie_pmu_event_stop(struct perf_event *event, int flags) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + struct hw_perf_event *hwc = &event->hw; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + if (type == DWC_PCIE_LANE_EVENT) + dwc_pcie_pmu_lane_event_enable(pcie_pmu, false); + else if (type == DWC_PCIE_TIME_BASE_EVENT) + dwc_pcie_pmu_time_based_event_enable(pcie_pmu, false); + + dwc_pcie_pmu_event_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int dwc_pcie_pmu_event_add(struct perf_event *event, int flags) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + struct pci_dev *pdev = pcie_pmu->pdev; + struct hw_perf_event *hwc = &event->hw; + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + int event_id = DWC_PCIE_EVENT_ID(event); + int lane = DWC_PCIE_EVENT_LANE(event); + u16 ras_des_offset = pcie_pmu->ras_des_offset; + u32 ctrl; + + /* one counter for each type and it is in use */ + if (pcie_pmu->event[type]) + return -ENOSPC; + + pcie_pmu->event[type] = event; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + if (type == DWC_PCIE_LANE_EVENT) { + /* EVENT_COUNTER_DATA_REG needs clear manually */ + ctrl = FIELD_PREP(DWC_PCIE_CNT_EVENT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_CNT_LANE_SEL, lane) | + FIELD_PREP(DWC_PCIE_CNT_ENABLE, DWC_PCIE_PER_EVENT_OFF) | + FIELD_PREP(DWC_PCIE_EVENT_CLEAR, DWC_PCIE_EVENT_PER_CLEAR); + pci_write_config_dword(pdev, ras_des_offset + DWC_PCIE_EVENT_CNT_CTL, + ctrl); + } else if (type == DWC_PCIE_TIME_BASE_EVENT) { + /* + * TIME_BASED_ANAL_DATA_REG is a 64 bit register, we can safely + * use it with any manually controlled duration. And it is + * cleared when next measurement starts. + */ + ctrl = FIELD_PREP(DWC_PCIE_TIME_BASED_REPORT_SEL, event_id) | + FIELD_PREP(DWC_PCIE_TIME_BASED_DURATION_SEL, + DWC_PCIE_DURATION_MANUAL_CTL) | + DWC_PCIE_TIME_BASED_CNT_ENABLE; + pci_write_config_dword( + pdev, ras_des_offset + DWC_PCIE_TIME_BASED_ANAL_CTL, ctrl); + } + + if (flags & PERF_EF_START) + dwc_pcie_pmu_event_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + + return 0; +} + +static void dwc_pcie_pmu_event_del(struct perf_event *event, int flags) +{ + struct dwc_pcie_pmu *pcie_pmu = to_dwc_pcie_pmu(event->pmu); + enum dwc_pcie_event_type type = DWC_PCIE_EVENT_TYPE(event); + + dwc_pcie_pmu_event_stop(event, flags | PERF_EF_UPDATE); + perf_event_update_userpage(event); + pcie_pmu->event[type] = NULL; +} + +static void dwc_pcie_pmu_remove_cpuhp_instance(void *hotplug_node) +{ + cpuhp_state_remove_instance_nocalls(dwc_pcie_pmu_hp_state, hotplug_node); +} + +/* + * Find the binded DES capability device info of a PCI device. + * @pdev: The PCI device. + */ +static struct dwc_pcie_dev_info *dwc_pcie_find_dev_info(struct pci_dev *pdev) +{ + struct dwc_pcie_dev_info *dev_info; + + list_for_each_entry(dev_info, &dwc_pcie_dev_info_head, dev_node) + if (dev_info->pdev == pdev) + return dev_info; + + return NULL; +} + +static void dwc_pcie_unregister_pmu(void *data) +{ + struct dwc_pcie_pmu *pcie_pmu = data; + + perf_pmu_unregister(&pcie_pmu->pmu); +} + +static bool dwc_pcie_match_des_cap(struct pci_dev *pdev) +{ + const struct dwc_pcie_vendor_id *vid; + u16 vsec = 0; + u32 val; + + if (!pci_is_pcie(pdev) || !(pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT)) + return false; + + for (vid = dwc_pcie_vendor_ids; vid->vendor_id; vid++) { + vsec = pci_find_vsec_capability(pdev, vid->vendor_id, + DWC_PCIE_VSEC_RAS_DES_ID); + if (vsec) + break; + } + if (!vsec) + return false; + + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val); + if (PCI_VNDR_HEADER_REV(val) != 0x04) + return false; + + pci_dbg(pdev, + "Detected PCIe Vendor-Specific Extended Capability RAS DES\n"); + return true; +} + +static void dwc_pcie_unregister_dev(struct dwc_pcie_dev_info *dev_info) +{ + platform_device_unregister(dev_info->plat_dev); + list_del(&dev_info->dev_node); + kfree(dev_info); +} + +static int dwc_pcie_register_dev(struct pci_dev *pdev) +{ + struct platform_device *plat_dev; + struct dwc_pcie_dev_info *dev_info; + u32 bdf; + + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + plat_dev = platform_device_register_data(NULL, "dwc_pcie_pmu", bdf, + pdev, sizeof(*pdev)); + + if (IS_ERR(plat_dev)) + return PTR_ERR(plat_dev); + + dev_info = kzalloc(sizeof(*dev_info), GFP_KERNEL); + if (!dev_info) + return -ENOMEM; + + /* Cache platform device to handle pci device hotplug */ + dev_info->plat_dev = plat_dev; + dev_info->pdev = pdev; + list_add(&dev_info->dev_node, &dwc_pcie_dev_info_head); + + return 0; +} + +static int dwc_pcie_pmu_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct dwc_pcie_dev_info *dev_info; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + if (!dwc_pcie_match_des_cap(pdev)) + return NOTIFY_DONE; + if (dwc_pcie_register_dev(pdev)) + return NOTIFY_BAD; + break; + case BUS_NOTIFY_DEL_DEVICE: + dev_info = dwc_pcie_find_dev_info(pdev); + if (!dev_info) + return NOTIFY_DONE; + dwc_pcie_unregister_dev(dev_info); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block dwc_pcie_pmu_nb = { + .notifier_call = dwc_pcie_pmu_notifier, +}; + +static int dwc_pcie_pmu_probe(struct platform_device *plat_dev) +{ + struct pci_dev *pdev = plat_dev->dev.platform_data; + struct dwc_pcie_pmu *pcie_pmu; + char *name; + u32 bdf, val; + u16 vsec; + int ret; + + vsec = pci_find_vsec_capability(pdev, pdev->vendor, + DWC_PCIE_VSEC_RAS_DES_ID); + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val); + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + name = devm_kasprintf(&plat_dev->dev, GFP_KERNEL, "dwc_rootport_%x", bdf); + if (!name) + return -ENOMEM; + + pcie_pmu = devm_kzalloc(&plat_dev->dev, sizeof(*pcie_pmu), GFP_KERNEL); + if (!pcie_pmu) + return -ENOMEM; + + pcie_pmu->pdev = pdev; + pcie_pmu->ras_des_offset = vsec; + pcie_pmu->nr_lanes = pcie_get_width_cap(pdev); + pcie_pmu->on_cpu = -1; + pcie_pmu->pmu = (struct pmu){ + .name = name, + .parent = &pdev->dev, + .module = THIS_MODULE, + .attr_groups = dwc_pcie_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .event_init = dwc_pcie_pmu_event_init, + .add = dwc_pcie_pmu_event_add, + .del = dwc_pcie_pmu_event_del, + .start = dwc_pcie_pmu_event_start, + .stop = dwc_pcie_pmu_event_stop, + .read = dwc_pcie_pmu_event_update, + }; + + /* Add this instance to the list used by the offline callback */ + ret = cpuhp_state_add_instance(dwc_pcie_pmu_hp_state, + &pcie_pmu->cpuhp_node); + if (ret) { + pci_err(pdev, "Error %d registering hotplug @%x\n", ret, bdf); + return ret; + } + + /* Unwind when platform driver removes */ + ret = devm_add_action_or_reset(&plat_dev->dev, + dwc_pcie_pmu_remove_cpuhp_instance, + &pcie_pmu->cpuhp_node); + if (ret) + return ret; + + ret = perf_pmu_register(&pcie_pmu->pmu, name, -1); + if (ret) { + pci_err(pdev, "Error %d registering PMU @%x\n", ret, bdf); + return ret; + } + ret = devm_add_action_or_reset(&plat_dev->dev, dwc_pcie_unregister_pmu, + pcie_pmu); + if (ret) + return ret; + + return 0; +} + +static int dwc_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct dwc_pcie_pmu *pcie_pmu; + + pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node); + if (pcie_pmu->on_cpu == -1) + pcie_pmu->on_cpu = cpumask_local_spread( + 0, dev_to_node(&pcie_pmu->pdev->dev)); + + return 0; +} + +static int dwc_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node) +{ + struct dwc_pcie_pmu *pcie_pmu; + struct pci_dev *pdev; + int node; + cpumask_t mask; + unsigned int target; + + pcie_pmu = hlist_entry_safe(cpuhp_node, struct dwc_pcie_pmu, cpuhp_node); + /* Nothing to do if this CPU doesn't own the PMU */ + if (cpu != pcie_pmu->on_cpu) + return 0; + + pcie_pmu->on_cpu = -1; + pdev = pcie_pmu->pdev; + node = dev_to_node(&pdev->dev); + if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) && + cpumask_andnot(&mask, &mask, cpumask_of(cpu))) + target = cpumask_any(&mask); + else + target = cpumask_any_but(cpu_online_mask, cpu); + + if (target >= nr_cpu_ids) { + pci_err(pdev, "There is no CPU to set\n"); + return 0; + } + + /* This PMU does NOT support interrupt, just migrate context. */ + perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target); + pcie_pmu->on_cpu = target; + + return 0; +} + +static struct platform_driver dwc_pcie_pmu_driver = { + .probe = dwc_pcie_pmu_probe, + .driver = {.name = "dwc_pcie_pmu",}, +}; + +static int __init dwc_pcie_pmu_init(void) +{ + struct pci_dev *pdev = NULL; + bool found = false; + int ret; + + for_each_pci_dev(pdev) { + if (!dwc_pcie_match_des_cap(pdev)) + continue; + + ret = dwc_pcie_register_dev(pdev); + if (ret) { + pci_dev_put(pdev); + return ret; + } + + found = true; + } + if (!found) + return -ENODEV; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/dwc_pcie_pmu:online", + dwc_pcie_pmu_online_cpu, + dwc_pcie_pmu_offline_cpu); + if (ret < 0) + return ret; + + dwc_pcie_pmu_hp_state = ret; + + ret = platform_driver_register(&dwc_pcie_pmu_driver); + if (ret) + goto platform_driver_register_err; + + ret = bus_register_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); + if (ret) + goto platform_driver_register_err; + notify = true; + + return 0; + +platform_driver_register_err: + cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); + + return ret; +} + +static void __exit dwc_pcie_pmu_exit(void) +{ + struct dwc_pcie_dev_info *dev_info, *tmp; + + if (notify) + bus_unregister_notifier(&pci_bus_type, &dwc_pcie_pmu_nb); + list_for_each_entry_safe(dev_info, tmp, &dwc_pcie_dev_info_head, dev_node) + dwc_pcie_unregister_dev(dev_info); + platform_driver_unregister(&dwc_pcie_pmu_driver); + cpuhp_remove_multi_state(dwc_pcie_pmu_hp_state); +} + +module_init(dwc_pcie_pmu_init); +module_exit(dwc_pcie_pmu_exit); + +MODULE_DESCRIPTION("PMU driver for DesignWare Cores PCI Express Controller"); +MODULE_AUTHOR("Shuai Xue "); +MODULE_LICENSE("GPL v2"); From f56bb3de66bc7db90cd6df0a9866167e8a79317e Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:52 +0800 Subject: [PATCH 260/310] MAINTAINERS: add maintainers for DesignWare PCIe PMU driver Add maintainers for Synopsys DesignWare PCIe PMU driver and driver document. Signed-off-by: Shuai Xue Link: https://lore.kernel.org/r/20231208025652.87192-6-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 012df8ccf34e..5b28ec113c7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21090,6 +21090,13 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/dw_mmc* +SYNOPSYS DESIGNWARE PCIE PMU DRIVER +M: Shuai Xue +M: Jing Zhang +S: Supported +F: Documentation/admin-guide/perf/dwc_pcie_pmu.rst +F: drivers/perf/dwc_pcie_pmu.c + SYNOPSYS HSDK RESET CONTROLLER DRIVER M: Eugeniy Paltsev S: Supported From 63a2d92e1461396cd83742cbaf20af240b7a94e9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 12 Dec 2023 17:09:09 +0000 Subject: [PATCH 261/310] arm64: Cleanup system cpucap handling Recent changes to remove cpus_have_const_cap() introduced new users of cpus_have_cap() in the period between detecting system cpucaps and patching alternatives. It would be preferable to defer these until after the relevant cpucaps have been patched so that these can use the usual feature check helper functions, which is clearer and has less risk of accidental usage of code relying upon an alternative which has not yet been patched. This patch reworks the system-wide cpucap detection and patching to minimize this transient period: * The detection, enablement, and patching of system cpucaps is moved into a new setup_system_capabilities() function so that these can be grouped together more clearly, with no other functions called in the period between detection and patching. This is called from setup_system_features() before the subsequent checks that depend on the cpucaps. The logging of TTBR0 PAN and cpucaps with a mask is also moved here to keep these as close as possible to update_cpu_capabilities(). At the same time, comments are corrected and improved to make the intent clearer. * As hyp_mode_check() only tests system register values (not hwcaps) and must be called prior to patching, the call to hyp_mode_check() is moved before the call to setup_system_features(). * In setup_system_features(), the use of system_uses_ttbr0_pan() is restored, now that this occurs after alternatives are patched. This is a partial revert of commit: 53d62e995d9eaed1 ("arm64: Avoid cpus_have_const_cap() for ARM64_HAS_PAN") * In sve_setup() and sme_setup(), the use of system_supports_sve() and system_supports_sme() respectively are restored, now that these occur after alternatives are patched. This is a partial revert of commit: a76521d160284a1e ("arm64: Avoid cpus_have_const_cap() for ARM64_{SVE,SME,SME2,FA64}") Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20231212170910.3745497-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 46 ++++++++++++++++++++-------------- arch/arm64/kernel/fpsimd.c | 4 +-- arch/arm64/kernel/smp.c | 3 +-- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b335da126e86..5f1320c38aa2 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3318,23 +3318,40 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } -void __init setup_system_features(void) +static void __init setup_system_capabilities(void) { - int i; /* - * The system-wide safe feature feature register values have been - * finalized. Finalize and log the available system capabilities. + * The system-wide safe feature register values have been finalized. + * Detect, enable, and patch alternatives for the available system + * cpucaps. */ update_cpu_capabilities(SCOPE_SYSTEM); - if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_cap(ARM64_HAS_PAN)) - pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); + enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); + apply_alternatives_all(); /* - * Enable all the available capabilities which have not been enabled - * already. + * Log any cpucaps with a cpumask as these aren't logged by + * update_cpu_capabilities(). */ - enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU); + for (int i = 0; i < ARM64_NCAPS; i++) { + const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; + + if (caps && caps->cpus && caps->desc && + cpumask_any(caps->cpus) < nr_cpu_ids) + pr_info("detected: %s on CPU%*pbl\n", + caps->desc, cpumask_pr_args(caps->cpus)); + } + + /* + * TTBR0 PAN doesn't have its own cpucap, so log it manually. + */ + if (system_uses_ttbr0_pan()) + pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); +} + +void __init setup_system_features(void) +{ + setup_system_capabilities(); kpti_install_ng_mappings(); @@ -3347,15 +3364,6 @@ void __init setup_system_features(void) if (!cache_type_cwg()) pr_warn("No Cache Writeback Granule information, assuming %d\n", ARCH_DMA_MINALIGN); - - for (i = 0; i < ARM64_NCAPS; i++) { - const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i]; - - if (caps && caps->cpus && caps->desc && - cpumask_any(caps->cpus) < nr_cpu_ids) - pr_info("detected: %s on CPU%*pbl\n", - caps->desc, cpumask_pr_args(caps->cpus)); - } } void __init setup_user_features(void) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 1559c706d32d..bc9384517db3 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1171,7 +1171,7 @@ void __init sve_setup(void) unsigned long b; int max_bit; - if (!cpus_have_cap(ARM64_SVE)) + if (!system_supports_sve()) return; /* @@ -1301,7 +1301,7 @@ void __init sme_setup(void) struct vl_info *info = &vl_info[ARM64_VEC_SME]; int min_bit, max_bit; - if (!cpus_have_cap(ARM64_SME)) + if (!system_supports_sme()) return; /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index defbab84e9e5..85384dc9a89d 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -439,9 +439,8 @@ static void __init hyp_mode_check(void) void __init smp_cpus_done(unsigned int max_cpus) { pr_info("SMP: Total of %d processors activated.\n", num_online_cpus()); - setup_system_features(); hyp_mode_check(); - apply_alternatives_all(); + setup_system_features(); setup_user_features(); mark_linear_text_alias_ro(); } From eb15d707c252c3fb56e3ea57a185976c67e5a07f Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 12 Dec 2023 17:09:10 +0000 Subject: [PATCH 262/310] arm64: Align boot cpucap handling with system cpucap handling Currently the detection+enablement of boot cpucaps is separate from the patching of boot cpucap alternatives, which means there's a period where cpus_have_cap($CAP) and alternative_has_cap($CAP) may be mismatched. It would be preferable to manage the boot cpucaps in the same way as the system cpucaps, both for clarity and to minimize the risk of accidental usage of code relying upon an alternative which has not yet been patched. This patch aligns the handling of boot cpucaps with the handling of system cpucaps: * The existing setup_boot_cpu_capabilities() function is moved to be closer to the setup_system_capabilities() and setup_system_features() functions so that they're more clearly related and more likely to be updated together in future. * The patching of boot cpucap alternatives is moved into setup_boot_cpu_capabilities(), immediately after boot cpucaps are detected and enabled. * A new setup_boot_cpu_features() function is added to mirror setup_system_features(); this handles initialization of cpucap data structures and calls setup_boot_cpu_capabilities(). This makes init_cpu_features() a closer mirror to update_cpu_features(), and makes smp_prepare_boot_cpu() a closer mirror to smp_cpus_done(). Importantly, while these changes alter the structure of the code, they retain the existing order of calls to: init_cpu_features(); // prefix initializing feature regs init_cpucap_indirect_list(); detect_system_supports_pseudo_nmi(); update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); enable_cpu_capabilities(SCOPE_BOOT_CPU); apply_boot_alternatives(); ... and hence there should be no functional change as a result of this patch; this is purely a structural cleanup. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20231212170910.3745497-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 57 +++++++++++++++-------------- arch/arm64/kernel/smp.c | 9 +---- 3 files changed, 33 insertions(+), 34 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index f6d416fe49b0..2ea24c5cb900 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -617,6 +617,7 @@ static inline bool id_aa64pfr1_mte(u64 pfr1) return val >= ID_AA64PFR1_EL1_MTE_MTE2; } +void __init setup_boot_cpu_features(void); void __init setup_system_features(void); void __init setup_user_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5f1320c38aa2..e976538c11ac 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1081,25 +1081,6 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); - - /* - * Initialize the indirect array of CPU capabilities pointers before we - * handle the boot CPU below. - */ - init_cpucap_indirect_list(); - - /* - * Detect broken pseudo-NMI. Must be called _before_ the call to - * setup_boot_cpu_capabilities() since it interacts with - * can_use_gic_priorities(). - */ - detect_system_supports_pseudo_nmi(); - - /* - * Detect and enable early CPU capabilities based on the boot CPU, - * after we have initialised the CPU feature infrastructure. - */ - setup_boot_cpu_capabilities(); } static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new) @@ -3255,14 +3236,6 @@ void check_local_cpu_capabilities(void) verify_local_cpu_capabilities(); } -static void __init setup_boot_cpu_capabilities(void) -{ - /* Detect capabilities with either SCOPE_BOOT_CPU or SCOPE_LOCAL_CPU */ - update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); - /* Enable the SCOPE_BOOT_CPU capabilities alone right away */ - enable_cpu_capabilities(SCOPE_BOOT_CPU); -} - bool this_cpu_has_cap(unsigned int n) { if (!WARN_ON(preemptible()) && n < ARM64_NCAPS) { @@ -3318,6 +3291,36 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +static void __init setup_boot_cpu_capabilities(void) +{ + /* + * The boot CPU's feature register values have been recorded. Detect + * boot cpucaps and local cpucaps for the boot CPU, then enable and + * patch alternatives for the available boot cpucaps. + */ + update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU); + enable_cpu_capabilities(SCOPE_BOOT_CPU); + apply_boot_alternatives(); +} + +void __init setup_boot_cpu_features(void) +{ + /* + * Initialize the indirect array of CPU capabilities pointers before we + * handle the boot CPU. + */ + init_cpucap_indirect_list(); + + /* + * Detect broken pseudo-NMI. Must be called _before_ the call to + * setup_boot_cpu_capabilities() since it interacts with + * can_use_gic_priorities(). + */ + detect_system_supports_pseudo_nmi(); + + setup_boot_cpu_capabilities(); +} + static void __init setup_system_capabilities(void) { /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 85384dc9a89d..4ced34f62dab 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -453,14 +453,9 @@ void __init smp_prepare_boot_cpu(void) * freed shortly, so we must move over to the runtime per-cpu area. */ set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - cpuinfo_store_boot_cpu(); - /* - * We now know enough about the boot CPU to apply the - * alternatives that cannot wait until interrupt handling - * and/or scheduling is enabled. - */ - apply_boot_alternatives(); + cpuinfo_store_boot_cpu(); + setup_boot_cpu_features(); /* Conditionally switch to GIC PMR for interrupt masking */ if (system_uses_irq_prio_masking()) From 30579c8baa5b4bd986420a984dad2940f1ff65d3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Thu, 30 Nov 2023 14:26:01 +0100 Subject: [PATCH 263/310] x86/sev: Do the C-bit verification only on the BSP There's no need to do it on every AP. The C-bit value read on the BSP and also verified there, is used everywhere from now on. No functional changes - just a bit faster booting APs. Signed-off-by: Borislav Petkov (AMD) Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/20231130132601.10317-1-bp@alien8.de --- arch/x86/kernel/head_64.S | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 086a2c3aaaa0..d1dc39af6771 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -114,6 +114,28 @@ SYM_CODE_START_NOALIGN(startup_64) /* Form the CR3 value being sure to include the CR3 modifier */ addq $(early_top_pgt - __START_KERNEL_map), %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + mov %rax, %rdi + mov %rax, %r14 + + addq phys_base(%rip), %rdi + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + */ + call sev_verify_cbit + + /* + * Restore CR3 value without the phys_base which will be added + * below, before writing %cr3. + */ + mov %r14, %rax +#endif + jmp 1f SYM_CODE_END(startup_64) @@ -192,15 +214,6 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup early boot stage 4-/5-level pagetables. */ addq phys_base(%rip), %rax - /* - * For SEV guests: Verify that the C-bit is correct. A malicious - * hypervisor could lie about the C-bit position to perform a ROP - * attack on the guest by writing to the unencrypted stack and wait for - * the next RET instruction. - */ - movq %rax, %rdi - call sev_verify_cbit - /* * Switch to new page-table * From 6f4b7052daa060e7d20d6d599697b8ac702a7e69 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 14 Nov 2023 12:42:19 +0530 Subject: [PATCH 264/310] powerpc/sched: Cleanup vcpu_is_preempted() No functional change in this patch. A helper is added to find if vcpu is dispatched by hypervisor. Use that instead of opencoding. Also clarify some of the comments. Signed-off-by: "Aneesh Kumar K.V" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231114071219.198222-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/paravirt.h | 33 ++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h index ac4279208d63..b78b82d66057 100644 --- a/arch/powerpc/include/asm/paravirt.h +++ b/arch/powerpc/include/asm/paravirt.h @@ -76,6 +76,17 @@ static inline bool is_vcpu_idle(int vcpu) { return lppaca_of(vcpu).idle; } + +static inline bool vcpu_is_dispatched(int vcpu) +{ + /* + * This is the yield_count. An "odd" value (low bit on) means that + * the processor is yielded (either because of an OS yield or a + * hypervisor preempt). An even value implies that the processor is + * currently executing. + */ + return (!(yield_count_of(vcpu) & 1)); +} #else static inline bool is_shared_processor(void) { @@ -109,6 +120,10 @@ static inline bool is_vcpu_idle(int vcpu) { return false; } +static inline bool vcpu_is_dispatched(int vcpu) +{ + return true; +} #endif #define vcpu_is_preempted vcpu_is_preempted @@ -134,12 +149,12 @@ static inline bool vcpu_is_preempted(int cpu) * If the hypervisor has dispatched the target CPU on a physical * processor, then the target CPU is definitely not preempted. */ - if (!(yield_count_of(cpu) & 1)) + if (vcpu_is_dispatched(cpu)) return false; /* - * If the target CPU has yielded to Hypervisor but OS has not - * requested idle then the target CPU is definitely preempted. + * if the target CPU is not dispatched and the guest OS + * has not marked the CPU idle, then it is hypervisor preempted. */ if (!is_vcpu_idle(cpu)) return true; @@ -166,7 +181,7 @@ static inline bool vcpu_is_preempted(int cpu) /* * The PowerVM hypervisor dispatches VMs on a whole core - * basis. So we know that a thread sibling of the local CPU + * basis. So we know that a thread sibling of the executing CPU * cannot have been preempted by the hypervisor, even if it * has called H_CONFER, which will set the yield bit. */ @@ -174,15 +189,17 @@ static inline bool vcpu_is_preempted(int cpu) return false; /* - * If any of the threads of the target CPU's core are not - * preempted or ceded, then consider target CPU to be - * non-preempted. + * The specific target CPU was marked by guest OS as idle, but + * then also check all other cpus in the core for PowerVM + * because it does core scheduling and one of the vcpu + * of the core getting preempted by hypervisor implies + * other vcpus can also be considered preempted. */ first_cpu = cpu_first_thread_sibling(cpu); for (i = first_cpu; i < first_cpu + threads_per_core; i++) { if (i == cpu) continue; - if (!(yield_count_of(i) & 1)) + if (vcpu_is_dispatched(i)) return false; if (!is_vcpu_idle(i)) return true; From aa80c6343fcf53cbc29f84ba9f89ca87d4e41350 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:11 +0530 Subject: [PATCH 265/310] powerpc/smp: Enable Asym packing for cores on shared processor If there are shared processor LPARs, underlying Hypervisor can have more virtual cores to handle than actual physical cores. Starting with Power 9, a big core (aka SMT8 core) has 2 nearly independent thread groups. On a shared processors LPARs, it helps to pack threads to lesser number of cores so that the overall system performance and utilization improves. PowerVM schedules at a big core level. Hence packing to fewer cores helps. Since each thread-group is independent, running threads on both the thread-groups of a SMT8 core, should have a minimal adverse impact in non over provisioned scenarios. These changes in this patchset will not affect in the over provisioned scenario. If there are more threads than SMT domains, then asym_packing will not kick-in For example: Lets says there are two 8-core Shared LPARs that are actually sharing a 8 Core shared physical pool, each running 8 threads each. Then Consolidating 8 threads to 4 cores on each LPAR would help them to perform better. This is because each of the LPAR will get 100% time to run applications and there will no switching required by the Hypervisor. To achieve this, enable SD_ASYM_PACKING flag at CACHE, MC and DIE level when the system is running in shared processor mode and has big cores. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-2-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ab691c89d787..3fc8ad9646a4 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1003,6 +1003,13 @@ static int powerpc_smt_flags(void) } #endif +/* + * On shared processor LPARs scheduled on a big core (which has two or more + * independent thread groups per core), prefer lower numbered CPUs, so + * that workload consolidates to lesser number of cores. + */ +static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); + /* * P9 has a slightly odd architecture where pairs of cores share an L2 cache. * This topology makes it *much* cheaper to migrate tasks between adjacent cores @@ -1011,9 +1018,20 @@ static int powerpc_smt_flags(void) */ static int powerpc_shared_cache_flags(void) { + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING; + return SD_SHARE_PKG_RESOURCES; } +static int powerpc_shared_proc_flags(void) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_ASYM_PACKING; + + return 0; +} + /* * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. @@ -1050,8 +1068,8 @@ static struct sched_domain_topology_level powerpc_topology[] = { { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, #endif { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_mc_mask, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + { cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) }, + { cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -1686,6 +1704,9 @@ static void __init fixup_topology(void) { int i; + if (is_shared_processor() && has_big_cores) + static_branch_enable(&splpar_asym_pack); + #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); From 0e1c1986e0e65746daa05405d7747ce882f83cf1 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:12 +0530 Subject: [PATCH 266/310] powerpc/smp: Disable MC domain for shared processor Like L2-cache info, coregroup information which is used to determine MC sched domains is only present on dedicated LPARs. i.e PowerVM doesn't export coregroup information for shared processor LPARs. Hence disable creating MC domains on shared LPAR Systems. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-3-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 3fc8ad9646a4..2cebc53e97f9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1055,6 +1055,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu) static bool has_coregroup_support(void) { + /* Coregroup identification not available on shared systems */ + if (is_shared_processor()) + return 0; + return coregroup_enabled; } From fd535a858ebeb1f478b1d065b6c057f52aad483a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:13 +0530 Subject: [PATCH 267/310] powerpc/smp: Add __ro_after_init attribute There are some variables that are only updated at boot time. So add __ro_after_init attribute to such variables Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-4-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2cebc53e97f9..aea149627209 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -77,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct task_struct *secondary_current; -bool has_big_cores; -bool coregroup_enabled; -bool thread_group_shares_l2; -bool thread_group_shares_l3; +bool has_big_cores __ro_after_init; +bool coregroup_enabled __ro_after_init; +bool thread_group_shares_l2 __ro_after_init; +bool thread_group_shares_l3 __ro_after_init; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -987,7 +987,7 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return 0; } -static bool shared_caches; +static bool shared_caches __ro_after_init; #ifdef CONFIG_SCHED_SMT /* cpumask of CPUs with asymmetric SMT dependency */ From 0e93f1c780e8fd315f1262467b7d35eb6f766d2f Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:14 +0530 Subject: [PATCH 268/310] powerpc/smp: Avoid asym packing within thread_group of a core PowerVM Hypervisor will schedule at a core granularity. However each core can have more than one thread_groups. For better utilization in case of a shared processor, its preferable for the scheduler to pack to the lowest core. However there is no benefit of moving a thread between two thread groups of the same core. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-5-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index aea149627209..9d8bb9a084bd 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1763,6 +1763,19 @@ void __init smp_cpus_done(unsigned int max_cpus) set_sched_topology(powerpc_topology); } +/* + * For asym packing, by default lower numbered CPU has higher priority. + * On shared processors, pack to lower numbered core. However avoid moving + * between thread_groups within the same core. + */ +int arch_asym_cpu_priority(int cpu) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return -cpu / threads_per_core; + + return -cpu; +} + #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { From c46975715f5a7b941aa09bc0539a8dbe297f308f Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 14 Dec 2023 23:37:15 +0530 Subject: [PATCH 269/310] powerpc/smp: Dynamically build Powerpc topology Currently there are four Powerpc specific sched topologies. These are all statically defined. However not all these topologies are used by all Powerpc systems. To avoid unnecessary degenerations by the scheduler, masks and flags are compared. However if the sched topologies are build dynamically then the code is simpler and there are greater chances of avoiding degenerations. Note: Even X86 builds its sched topologies dynamically and proposed changes are very similar to the way X86 is building its topologies. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://msgid.link/20231214180720.310852-6-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 80 ++++++++++++++------------------------- 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 9d8bb9a084bd..693334c20d07 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -93,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); EXPORT_SYMBOL_GPL(has_big_cores); -enum { -#ifdef CONFIG_SCHED_SMT - smt_idx, -#endif - cache_idx, - mc_idx, - die_idx, -}; - #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 #define THREAD_GROUP_SHARE_L2_L3 2 @@ -1067,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu) return cpu_coregroup_mask(cpu); } -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) }, - { NULL, }, -}; - static int __init init_big_cores(void) { int cpu; @@ -1704,9 +1685,11 @@ void start_secondary(void *unused) BUG(); } -static void __init fixup_topology(void) +static struct sched_domain_topology_level powerpc_topology[6]; + +static void __init build_sched_topology(void) { - int i; + int i = 0; if (is_shared_processor() && has_big_cores) static_branch_enable(&splpar_asym_pack); @@ -1714,36 +1697,33 @@ static void __init fixup_topology(void) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[smt_idx].mask = smallcore_smt_mask; + powerpc_topology[i++] = (struct sched_domain_topology_level){ + smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; + } else { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; } #endif - - if (!has_coregroup_support()) - powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask; - - /* - * Try to consolidate topology levels here instead of - * allowing scheduler to degenerate. - * - Dont consolidate if masks are different. - * - Dont consolidate if sd_flags exists and are different. - */ - for (i = 1; i <= die_idx; i++) { - if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask) - continue; - - if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags && - powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags) - continue; - - if (!powerpc_topology[i - 1].sd_flags) - powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags; - - powerpc_topology[i].mask = powerpc_topology[i + 1].mask; - powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags; -#ifdef CONFIG_SCHED_DEBUG - powerpc_topology[i].name = powerpc_topology[i + 1].name; -#endif + if (shared_caches) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) + }; } + if (has_coregroup_support()) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) + }; + } + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) + }; + + /* There must be one trailing NULL entry left. */ + BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); + + set_sched_topology(powerpc_topology); } void __init smp_cpus_done(unsigned int max_cpus) @@ -1758,9 +1738,7 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_ops->bringup_done(); dump_numa_cpu_topology(); - - fixup_topology(); - set_sched_topology(powerpc_topology); + build_sched_topology(); } /* From 3ed57b41a4125609e9fd03e32228aec61d95fe1f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Nov 2023 11:54:48 -0800 Subject: [PATCH 270/310] x86/mce: Remove old CMCI storm mitigation code When a "storm" of corrected machine check interrupts (CMCI) is detected this code mitigates by disabling CMCI interrupt signalling from all of the banks owned by the CPU that saw the storm. There are problems with this approach: 1) It is very coarse grained. In all likelihood only one of the banks was generating the interrupts, but CMCI is disabled for all. This means Linux may delay seeing and processing errors logged from other banks. 2) Although CMCI stands for Corrected Machine Check Interrupt, it is also used to signal when an uncorrected error is logged. This is a problem because these errors should be handled in a timely manner. Delete all this code in preparation for a finer grained solution. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Tested-by: Yazen Ghannam Link: https://lore.kernel.org/r/20231115195450.12963-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 20 +--- arch/x86/kernel/cpu/mce/intel.c | 145 ----------------------------- arch/x86/kernel/cpu/mce/internal.h | 6 -- 3 files changed, 1 insertion(+), 170 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 1642018dd6c9..b2ef48714825 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1617,13 +1617,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1653,15 +1646,9 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) mc_poll_banks(); - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. @@ -1671,7 +1658,6 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: __this_cpu_write(mce_next_interval, iv); __start_timer(t, iv); } @@ -2011,7 +1997,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -2024,7 +2009,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: { @@ -2678,8 +2662,6 @@ static void mce_reenable_cpu(void) static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 52bce533ddcc..fc4ffc434023 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -41,15 +41,6 @@ */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); -/* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - /* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. @@ -64,21 +55,6 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); static DEFINE_SPINLOCK(cmci_poll_lock); #define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 - -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; - -static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -134,124 +110,6 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } -bool mce_intel_cmci_poll(void) -{ - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. - */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); - - return true; -} - -void mce_intel_hcpu_update(unsigned long cpu) -{ - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); - - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; -} - -static void cmci_toggle_interrupt_mode(bool on) -{ - unsigned long flags, *owned; - int bank; - u64 val; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; - - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; - } - - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - - fallthrough; - - case CMCI_STORM_SUBSIDED: - /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. - */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; - } -} - -static bool cmci_storm_detect(void) -{ - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; - - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; - - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); - } - __this_cpu_write(cmci_storm_cnt, cnt); - - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} - /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -260,9 +118,6 @@ static bool cmci_storm_detect(void) */ static void intel_threshold_interrupt(void) { - if (cmci_storm_detect()) - return; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e13a26c9c0ac..b18e99016ce5 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -41,9 +41,6 @@ struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); @@ -51,9 +48,6 @@ void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); bool intel_mce_usable_address(struct mce *m); #else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } From 7eae17c4add5de46efcca45356388f480103e6d9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Nov 2023 11:54:49 -0800 Subject: [PATCH 271/310] x86/mce: Add per-bank CMCI storm mitigation This is the core functionality to track CMCI storms at the machine check bank granularity. Subsequent patches will add the vendor specific hooks to supply input to the storm detection and take actions on the start/end of a storm. machine_check_poll() is called both by the CMCI interrupt code, and for periodic polls from a timer. Add a hook in this routine to maintain a bitmap history for each bank showing whether the bank logged an corrected error or not each time it is polled. In normal operation the interval between polls of these banks determines how far to shift the history. The 64 bit width corresponds to about one second. When a storm is observed a CPU vendor specific action is taken to reduce or stop CMCI from the bank that is the source of the storm. The bank is added to the bitmap of banks for this CPU to poll. The polling rate is increased to once per second. During a storm each bit in the history indicates the status of the bank each time it is polled. Thus the history covers just over a minute. Declare a storm for that bank if the number of corrected interrupts seen in that history is above some threshold (defined as 5 in this series, could be tuned later if there is data to suggest a better value). A storm on a bank ends if enough consecutive polls of the bank show no corrected errors (defined as 30, may also change). That calls the CPU vendor specific function to revert to normal operational mode, and changes the polling rate back to the default. [ bp: Massage. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231115195450.12963-3-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 33 ++++++-- arch/x86/kernel/cpu/mce/internal.h | 58 +++++++++++++- arch/x86/kernel/cpu/mce/threshold.c | 112 ++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b2ef48714825..fd5ce12c4f9a 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -686,6 +686,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + /* + * Update storm tracking here, before checking for the + * MCI_STATUS_VAL bit. Valid corrected errors count + * towards declaring, or maintaining, storm status. No + * error in a bank counts towards avoiding, or ending, + * storm status. + */ + if (!mca_cfg.cmci_disabled) + mce_track_storm(&m); + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; @@ -1658,22 +1668,29 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. */ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b18e99016ce5..157b2f295667 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -56,7 +56,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; } static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif -void mce_timer_kick(unsigned long interval); +void mce_timer_kick(bool storm); + +#ifdef CONFIG_X86_MCE_THRESHOLD +void cmci_storm_begin(unsigned int bank); +void cmci_storm_end(unsigned int bank); +void mce_track_storm(struct mce *mce); +void mce_inherit_storm(unsigned int bank); +bool mce_get_storm_mode(void); +void mce_set_storm_mode(bool storm); +#else +static inline void cmci_storm_begin(unsigned int bank) {} +static inline void cmci_storm_end(unsigned int bank) {} +static inline void mce_track_storm(struct mce *mce) {} +static inline void mce_inherit_storm(unsigned int bank) {} +static inline bool mce_get_storm_mode(void) { return false; } +static inline void mce_set_storm_mode(bool storm) {} +#endif + +/* + * history: Bitmask tracking errors occurrence. Each set bit + * represents an error seen. + * + * timestamp: Last time (in jiffies) that the bank was polled. + * in_storm_mode: Is this bank in storm mode? + * poll_only: Bank does not support CMCI, skip storm tracking. + */ +struct storm_bank { + u64 history; + u64 timestamp; + bool in_storm_mode; + bool poll_only; +}; + +#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE) + +/* How many errors within the history buffer mark the start of a storm. */ +#define STORM_BEGIN_THRESHOLD 5 + +/* + * How many polls of machine check bank without an error before declaring + * the storm is over. Since it is tracked by the bitmasks in the history + * field of struct storm_bank the mask is 30 bits [0 ... 29]. + */ +#define STORM_END_POLL_THRESHOLD 29 + +/* + * banks: per-cpu, per-bank details + * stormy_bank_count: count of MC banks in storm state + * poll_mode: CPU is in poll mode + */ +struct mca_storm_desc { + struct storm_bank banks[MAX_NR_BANKS]; + u8 stormy_bank_count; + bool poll_mode; +}; + +DECLARE_PER_CPU(struct mca_storm_desc, storm_desc); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd88..0e1988468ee4 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,115 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. + */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. + */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} From 1f68ce2a027250aeeb1756391110cdc4dc97c797 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 15 Nov 2023 11:54:50 -0800 Subject: [PATCH 272/310] x86/mce: Handle Intel threshold interrupt storms Add an Intel specific hook into machine_check_poll() to keep track of per-CPU, per-bank corrected error logs (with a stub for the CONFIG_MCE_INTEL=n case). When a storm is observed the rate of interrupts is reduced by setting a large threshold value for this bank in IA32_MCi_CTL2. This bank is added to the bitmap of banks for this CPU to poll. The polling rate is increased to once per second. When a storm ends reset the threshold in IA32_MCi_CTL2 back to 1, remove the bank from the bitmap for polling, and change the polling rate back to the default. If a CPU with banks in storm mode is taken offline, the new CPU that inherits ownership of those banks takes over management of storm(s) in the inherited bank(s). The cmci_discover() function was already very large. These changes pushed it well over the top. Refactor with three helper functions to bring it back under control. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231115195450.12963-4-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/intel.c | 205 +++++++++++++++++++++------- arch/x86/kernel/cpu/mce/internal.h | 2 + arch/x86/kernel/cpu/mce/threshold.c | 3 + 3 files changed, 160 insertions(+), 50 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index fc4ffc434023..399b62e223d2 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -54,8 +54,27 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); */ static DEFINE_SPINLOCK(cmci_poll_lock); +/* Linux non-storm CMCI threshold (may be overridden by BIOS) */ #define CMCI_THRESHOLD 1 +/* + * MCi_CTL2 threshold for each bank when there is no storm. + * Default value for each bank may have been set by BIOS. + */ +static u16 cmci_threshold[MAX_NR_BANKS]; + +/* + * High threshold to limit CMCI rate during storms. Max supported is + * 0x7FFF. Use this slightly smaller value so it has a distinctive + * signature when some asks "Why am I not seeing all corrected errors?" + * A high threshold is used instead of just disabling CMCI for a + * bank because both corrected and uncorrected errors may be logged + * in the same bank and signalled with CMCI. The threshold only applies + * to corrected errors, so keeping CMCI enabled means that uncorrected + * errors will still be processed in a timely fashion. + */ +#define CMCI_STORM_THRESHOLD 32749 + static int cmci_supported(int *banks) { u64 cap; @@ -110,6 +129,31 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } +/* + * Set a new CMCI threshold value. Preserve the state of the + * MCI_CTL2_CMCI_EN bit in case this happens during a + * cmci_rediscover() operation. + */ +static void cmci_set_threshold(int bank, int thresh) +{ + unsigned long flags; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +void mce_intel_handle_storm(int bank, bool on) +{ + if (on) + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); + else + cmci_set_threshold(bank, cmci_threshold[bank]); +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -121,72 +165,130 @@ static void intel_threshold_interrupt(void) machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } +/* + * Check all the reasons why current CPU cannot claim + * ownership of a bank. + * 1: CPU already owns this bank + * 2: BIOS owns this bank + * 3: Some other CPU owns this bank + */ +static bool cmci_skip_bank(int bank, u64 *val) +{ + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + + if (test_bit(bank, owned)) + return true; + + /* Skip banks in firmware first mode */ + if (test_bit(bank, mce_banks_ce_disabled)) + return true; + + rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + + /* Already owned by someone else? */ + if (*val & MCI_CTL2_CMCI_EN) { + clear_bit(bank, owned); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + return true; + } + + return false; +} + +/* + * Decide which CMCI interrupt threshold to use: + * 1: If this bank is in storm mode from whichever CPU was + * the previous owner, stay in storm mode. + * 2: If ignoring any threshold set by BIOS, set Linux default + * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero). + */ +static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh) +{ + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + return val; + + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + *bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + return val; +} + +/* + * Try to claim ownership of a bank. + */ +static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + + /* If the enable bit did not stick, this bank should be polled. */ + if (!(val & MCI_CTL2_CMCI_EN)) { + WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks))); + storm->banks[bank].poll_only = true; + return; + } + + /* This CPU successfully set the enable bit. */ + set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned)); + + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) { + pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank); + mce_inherit_storm(bank); + cmci_storm_begin(bank); + } else { + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + } + + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + *bios_wrong_thresh = 1; + + /* Save default threshold for each bank */ + if (cmci_threshold[bank] == 0) + cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK; +} + /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. + * banks. Called during initial bootstrap, and also for hotplug CPU operations + * to rediscover/reassign machine check banks. */ static void cmci_discover(int banks) { - unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + int bios_wrong_thresh = 0; unsigned long flags; int i; - int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; - if (test_bit(i, owned)) + if (cmci_skip_bank(i, &val)) continue; - /* Skip banks in firmware first mode */ - if (test_bit(i, mce_banks_ce_disabled)) - continue; - - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Already owned by someone else? */ - if (val & MCI_CTL2_CMCI_EN) { - clear_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - continue; - } - - if (!mca_cfg.bios_cmci_threshold) { - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= CMCI_THRESHOLD; - } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { - /* - * If bios_cmci_threshold boot option was specified - * but the threshold is zero, we'll try to initialize - * it to 1. - */ - bios_zero_thresh = 1; - val |= CMCI_THRESHOLD; - } - - val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & MCI_CTL2_CMCI_EN) { - set_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - /* - * We are able to set thresholds for some banks that - * had a threshold of 0. This means the BIOS has not - * set the thresholds properly or does not work with - * this boot option. Note down now and report later. - */ - if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && - (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) - bios_wrong_thresh = 1; - } else { - WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); - } + val = cmci_pick_threshold(val, &bios_zero_thresh); + cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh); } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { @@ -225,6 +327,9 @@ static void __cmci_disable_bank(int bank) val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); + + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + cmci_storm_end(bank); } /* diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 157b2f295667..01f8f03969e6 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -41,6 +41,7 @@ struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL +void mce_intel_handle_storm(int bank, bool on); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); @@ -48,6 +49,7 @@ void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); bool intel_mce_usable_address(struct mce *m); #else +static inline void mce_intel_handle_storm(int bank, bool on) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 0e1988468ee4..89e31e1e5c9c 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -60,6 +60,9 @@ void mce_set_storm_mode(bool storm) static void mce_handle_storm(unsigned int bank, bool on) { switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_handle_storm(bank, on); + break; } } From bb339db4d363c84e0a8d70827df591397ccd7312 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 15 Dec 2023 17:56:48 +0000 Subject: [PATCH 273/310] arm: perf: Fix ARCH=arm build with GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLVM ignores everything inside the if statement and doesn't generate errors, but GCC doesn't ignore it, resulting in the following error: drivers/perf/arm_pmuv3.c: In function ‘armv8pmu_write_evtype’: include/linux/bits.h:34:29: error: left shift count >= width of type [-Werror=shift-count-overflow] 34 | (((~UL(0)) - (UL(1) << (l)) + 1) & \ Fix it by using GENMASK_ULL which doesn't overflow on arm32 (even though the value is never used there). Fixes: 3115ee021bfb ("arm64: perf: Include threshold control fields in PMEVTYPER mask") Reported-by: Uwe Kleine-König Closes: https://lore.kernel.org/linux-arm-kernel/20231215120817.h2f3akgv72zhrtqo@pengutronix.de/ Signed-off-by: James Clark Acked-by: Mark Rutland Reviewed-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231215175648.3397170-2-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 0f4d62ef3a9a..46377e134d67 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -234,8 +234,8 @@ * PMXEVTYPER: Event selection reg */ #define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ -#define ARMV8_PMU_EVTYPE_TH GENMASK(43, 32) -#define ARMV8_PMU_EVTYPE_TC GENMASK(63, 61) +#define ARMV8_PMU_EVTYPE_TH GENMASK_ULL(43, 32) /* arm64 only */ +#define ARMV8_PMU_EVTYPE_TC GENMASK_ULL(63, 61) /* arm64 only */ /* * Event filters for PMUv3 From 5cc5ed7a668dcfb62c02f380c7cca9c808242ed0 Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Fri, 15 Dec 2023 14:40:08 +0800 Subject: [PATCH 274/310] arm64: memory: remove duplicated include remove duplicated include Signed-off-by: Wang Jinchao Link: https://lore.kernel.org/r/202312151439+0800-wangjinchao@xfusion.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index fde4186cc387..27fc302a6d70 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -407,6 +407,5 @@ void dump_mem_limit(void); #define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8) #endif -#include #endif /* __ASM_MEMORY_H */ From 3b077ad8cb25c936ff55780c517dbd8ea36fb018 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:41 +0000 Subject: [PATCH 275/310] arm64/sysreg: Add missing Pauth_LR field definitions to ID_AA64ISAR1_EL1 Add the Pauth_LR field definitions to ID_AA64ISAR1_EL1, based on DDI0601 2023-09. These fields aren't used yet. Adding them for completeness and consistency (definition already exists for ID_AA64ISAR2_EL1). Signed-off-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20231214100158.2305400-2-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 2c4b6665c5bf..d596be2599d1 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1401,6 +1401,7 @@ UnsignedEnum 11:8 API 0b0011 PAuth2 0b0100 FPAC 0b0101 FPACCOMBINE + 0b0110 PAuth_LR EndEnum UnsignedEnum 7:4 APA 0b0000 NI @@ -1409,6 +1410,7 @@ UnsignedEnum 7:4 APA 0b0011 PAuth2 0b0100 FPAC 0b0101 FPACCOMBINE + 0b0110 PAuth_LR EndEnum UnsignedEnum 3:0 DPB 0b0000 NI From 4f101cdcb578638454eeff3e1d6c2cb2495d8005 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:42 +0000 Subject: [PATCH 276/310] arm64/sysreg: Add missing ExtTrcBuff field definition to ID_AA64DFR0_EL1 Add the ExtTrcBuff field definitions to ID_AA64DFR0_EL1 from DDI0601 2023-09. This field isn't used yet. Adding it for completeness and because it will be used in future patches. Signed-off-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20231214100158.2305400-3-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index d596be2599d1..a8e36640c027 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1189,7 +1189,10 @@ Enum 63:60 HPMN0 0b0000 UNPREDICTABLE 0b0001 DEF EndEnum -Res0 59:56 +UnsignedEnum 59:56 ExtTrcBuff + 0b0000 NI + 0b0001 IMP +EndEnum UnsignedEnum 55:52 BRBE 0b0000 NI 0b0001 IMP From 885c6d8e2885915451dd4f4a90ddd1bb82ba5a4f Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:43 +0000 Subject: [PATCH 277/310] arm64/sysreg: Add missing system register definitions for FGT Add the definitions of missing system registers that are trappable by fine grain traps. The definitions are based on DDI0601 2023-09. Signed-off-by: Fuad Tabba Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20231214100158.2305400-4-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/tools/sysreg | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index a8e36640c027..5ceaa1d3630e 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -2086,10 +2086,18 @@ Sysreg CONTEXTIDR_EL1 3 0 13 0 1 Fields CONTEXTIDR_ELx EndSysreg +Sysreg RCWSMASK_EL1 3 0 13 0 3 +Field 63:0 RCWSMASK +EndSysreg + Sysreg TPIDR_EL1 3 0 13 0 4 Field 63:0 ThreadID EndSysreg +Sysreg RCWMASK_EL1 3 0 13 0 6 +Field 63:0 RCWMASK +EndSysreg + Sysreg SCXTNUM_EL1 3 0 13 0 7 Field 63:0 SoftwareContextNumber EndSysreg @@ -2714,6 +2722,33 @@ Field 1 PIE Field 0 PnCH EndSysreg +SysregFields MAIR2_ELx +Field 63:56 Attr7 +Field 55:48 Attr6 +Field 47:40 Attr5 +Field 39:32 Attr4 +Field 31:24 Attr3 +Field 23:16 Attr2 +Field 15:8 Attr1 +Field 7:0 Attr0 +EndSysregFields + +Sysreg MAIR2_EL1 3 0 10 2 1 +Fields MAIR2_ELx +EndSysreg + +Sysreg MAIR2_EL2 3 4 10 1 1 +Fields MAIR2_ELx +EndSysreg + +Sysreg AMAIR2_EL1 3 0 10 3 1 +Field 63:0 ImpDef +EndSysreg + +Sysreg AMAIR2_EL2 3 4 10 3 1 +Field 63:0 ImpDef +EndSysreg + SysregFields PIRx_ELx Field 63:60 Perm15 Field 59:56 Perm14 @@ -2765,6 +2800,14 @@ Sysreg POR_EL12 3 5 10 2 4 Fields PIRx_ELx EndSysreg +Sysreg S2POR_EL1 3 0 10 2 5 +Fields PIRx_ELx +EndSysreg + +Sysreg S2PIR_EL2 3 4 10 2 5 +Fields PIRx_ELx +EndSysreg + Sysreg LORSA_EL1 3 0 10 4 0 Res0 63:52 Field 51:16 SA From 4ebee8cebdf6d661dfe7272cf74d378108160a3e Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Thu, 14 Dec 2023 10:01:44 +0000 Subject: [PATCH 278/310] arm64/sysreg: Add missing system instruction definitions for FGT Add the definitions of missing system instructions that are trappable by fine grain traps. The definitions are based on DDI0602 2023-09. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20231214100158.2305400-5-tabba@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9c2caf0efdc7..b320fb0de56b 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -645,6 +645,7 @@ #define OP_AT_S1E0W sys_insn(AT_Op0, 0, AT_CRn, 8, 3) #define OP_AT_S1E1RP sys_insn(AT_Op0, 0, AT_CRn, 9, 0) #define OP_AT_S1E1WP sys_insn(AT_Op0, 0, AT_CRn, 9, 1) +#define OP_AT_S1E1A sys_insn(AT_Op0, 0, AT_CRn, 9, 2) #define OP_AT_S1E2R sys_insn(AT_Op0, 4, AT_CRn, 8, 0) #define OP_AT_S1E2W sys_insn(AT_Op0, 4, AT_CRn, 8, 1) #define OP_AT_S12E1R sys_insn(AT_Op0, 4, AT_CRn, 8, 4) @@ -781,10 +782,16 @@ #define OP_TLBI_VMALLS12E1NXS sys_insn(1, 4, 9, 7, 6) /* Misc instructions */ +#define OP_GCSPUSHX sys_insn(1, 0, 7, 7, 4) +#define OP_GCSPOPCX sys_insn(1, 0, 7, 7, 5) +#define OP_GCSPOPX sys_insn(1, 0, 7, 7, 6) +#define OP_GCSPUSHM sys_insn(1, 3, 7, 7, 0) + #define OP_BRB_IALL sys_insn(1, 1, 7, 2, 4) #define OP_BRB_INJ sys_insn(1, 1, 7, 2, 5) #define OP_CFP_RCTX sys_insn(1, 3, 7, 3, 4) #define OP_DVP_RCTX sys_insn(1, 3, 7, 3, 5) +#define OP_COSP_RCTX sys_insn(1, 3, 7, 3, 6) #define OP_CPP_RCTX sys_insn(1, 3, 7, 3, 7) /* Common SCTLR_ELx flags. */ From 7b21ed7d119dc06b0ed2ba3e406a02cafe3a8d03 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Dec 2023 11:18:50 -0500 Subject: [PATCH 279/310] arm64: properly install vmlinuz.efi If you select CONFIG_EFI_ZBOOT, we will generate vmlinuz.efi, and then when we go to install the kernel we'll install the vmlinux instead because install.sh only recognizes Image.gz as wanting the compressed install image. With CONFIG_EFI_ZBOOT we don't get the proper kernel installed, which means it doesn't boot, which makes for a very confused and subsequently angry kernel developer. Fix this by properly installing our compressed kernel if we've enabled CONFIG_EFI_ZBOOT. Signed-off-by: Josef Bacik Cc: # 6.1.x Fixes: c37b830fef13 ("arm64: efi: enable generic EFI compressed boot") Reviewed-by: Simon Glass Link: https://lore.kernel.org/r/6edb1402769c2c14c4fbef8f7eaedb3167558789.1702570674.git.josef@toxicpanda.com Signed-off-by: Will Deacon --- arch/arm64/boot/install.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/install.sh b/arch/arm64/boot/install.sh index 7399d706967a..9b7a09808a3d 100755 --- a/arch/arm64/boot/install.sh +++ b/arch/arm64/boot/install.sh @@ -17,7 +17,8 @@ # $3 - kernel map file # $4 - default install path (blank if root directory) -if [ "$(basename $2)" = "Image.gz" ]; then +if [ "$(basename $2)" = "Image.gz" ] || [ "$(basename $2)" = "vmlinuz.efi" ] +then # Compressed install echo "Installing compressed kernel" base=vmlinuz From 97ba4416d6dd53c4202038ee7d86dfb29774e00f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 18 Dec 2023 17:01:27 +0900 Subject: [PATCH 280/310] efi/libstub: zboot: do not use $(shell ...) in cmd_copy_and_pad You do not need to use $(shell ...) in recipe lines, as they are already executed in a shell. An alternative solution is $$(...), which is an escaped sequence of the shell's command substituion, $(...). For this case, there is a reason to avoid $(shell ...). Kbuild detects command changes by using the if_changed macro, which compares the previous command recorded in .*.cmd with the current command from Makefile. If they differ, Kbuild re-runs the build rule. To diff the commands, Make must expand $(shell ...) first. It means that hexdump is executed every time, even when nothing needs rebuilding. If Kbuild determines that vmlinux.bin needs rebuilding, hexdump will be executed again to evaluate the 'cmd' macro, one more time to really build vmlinux.bin, and finally yet again to record the expanded command into .*.cmd. Replace $(shell ...) with $$(...) to avoid multiple, unnecessay shell evaluations. Since Make is agnostic about the shell code, $(...), the if_changed macro compares the string "$(hexdump -s16 -n4 ...)" verbatim, so hexdump is run only for building vmlinux.bin. For the same reason, $(shell ...) in EFI_ZBOOT_OBJCOPY_FLAGS should be eliminated. While I was here, I replaced '&&' with ';' because a command for if_changed is executed with 'set -e'. Signed-off-by: Masahiro Yamada Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20231218080127.907460-1-masahiroy@kernel.org Signed-off-by: Will Deacon --- arch/arm64/boot/Makefile | 2 +- drivers/firmware/efi/libstub/Makefile.zboot | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile index 1761f5972443..a5a787371117 100644 --- a/arch/arm64/boot/Makefile +++ b/arch/arm64/boot/Makefile @@ -44,7 +44,7 @@ EFI_ZBOOT_BFD_TARGET := elf64-littleaarch64 EFI_ZBOOT_MACH_TYPE := ARM64 EFI_ZBOOT_FORWARD_CFI := $(CONFIG_ARM64_BTI_KERNEL) -EFI_ZBOOT_OBJCOPY_FLAGS = --add-symbol zboot_code_size=0x$(shell \ +EFI_ZBOOT_OBJCOPY_FLAGS = --add-symbol zboot_code_size=0x$$( \ $(NM) vmlinux|grep _kernel_codesize|cut -d' ' -f1) include $(srctree)/drivers/firmware/efi/libstub/Makefile.zboot diff --git a/drivers/firmware/efi/libstub/Makefile.zboot b/drivers/firmware/efi/libstub/Makefile.zboot index 2c489627a807..65ffd0b760b2 100644 --- a/drivers/firmware/efi/libstub/Makefile.zboot +++ b/drivers/firmware/efi/libstub/Makefile.zboot @@ -5,8 +5,8 @@ # EFI_ZBOOT_FORWARD_CFI quiet_cmd_copy_and_pad = PAD $@ - cmd_copy_and_pad = cp $< $@ && \ - truncate -s $(shell hexdump -s16 -n4 -e '"%u"' $<) $@ + cmd_copy_and_pad = cp $< $@; \ + truncate -s $$(hexdump -s16 -n4 -e '"%u"' $<) $@ # Pad the file to the size of the uncompressed image in memory, including BSS $(obj)/vmlinux.bin: $(obj)/$(EFI_ZBOOT_PAYLOAD) FORCE From 9a802ddb2123e5adec394d35cd539cc0b15bc830 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 18 Dec 2023 23:39:32 +0000 Subject: [PATCH 281/310] kselftest/arm64: Don't probe the current VL for unsupported vector types The vec-syscfg selftest verifies that setting the VL of the currently tested vector type does not disrupt the VL of the other vector type. To do this it records the current vector length for each type but neglects to guard this with a check for that vector type actually being supported. Add one, using a helper function which we also update all the other instances of this pattern. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20231218-kselftest-arm64-vec-syscfg-rdvl-v1-1-0ac22d47e81f@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/vec-syscfg.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/vec-syscfg.c b/tools/testing/selftests/arm64/fp/vec-syscfg.c index 5f648b97a06f..ea9c7d47790f 100644 --- a/tools/testing/selftests/arm64/fp/vec-syscfg.c +++ b/tools/testing/selftests/arm64/fp/vec-syscfg.c @@ -66,6 +66,11 @@ static struct vec_data vec_data[] = { }, }; +static bool vec_type_supported(struct vec_data *data) +{ + return getauxval(data->hwcap_type) & data->hwcap; +} + static int stdio_read_integer(FILE *f, const char *what, int *val) { int n = 0; @@ -564,8 +569,11 @@ static void prctl_set_all_vqs(struct vec_data *data) return; } - for (i = 0; i < ARRAY_SIZE(vec_data); i++) + for (i = 0; i < ARRAY_SIZE(vec_data); i++) { + if (!vec_type_supported(&vec_data[i])) + continue; orig_vls[i] = vec_data[i].rdvl(); + } for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) { vl = sve_vl_from_vq(vq); @@ -594,7 +602,7 @@ static void prctl_set_all_vqs(struct vec_data *data) if (&vec_data[i] == data) continue; - if (!(getauxval(vec_data[i].hwcap_type) & vec_data[i].hwcap)) + if (!vec_type_supported(&vec_data[i])) continue; if (vec_data[i].rdvl() != orig_vls[i]) { @@ -765,7 +773,7 @@ int main(void) struct vec_data *data = &vec_data[i]; unsigned long supported; - supported = getauxval(data->hwcap_type) & data->hwcap; + supported = vec_type_supported(data); if (!supported) all_supported = false; From 9ec1d7486e2520b4898d7f8e1ec3acc7c13c8dc8 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Tue, 12 Dec 2023 19:44:58 +0100 Subject: [PATCH 282/310] powerpc/fsl: Fix fsl,tmu-calibration to match the schema fsl,tmu-calibration is defined as a u32 matrix in Documentation/devicetree/bindings/thermal/qoriq-thermal.yaml. Use matching property syntax. No functional changes. Signed-off-by: David Heidelberg Signed-off-by: Michael Ellerman Link: https://msgid.link/20231212184515.82886-2-david@ixit.cz --- arch/powerpc/boot/dts/fsl/t1023si-post.dtsi | 73 +++++++++++---------- arch/powerpc/boot/dts/fsl/t1040si-post.dtsi | 65 +++++++++--------- 2 files changed, 70 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index d552044c5afc..aa5152ca8120 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -367,45 +367,46 @@ reg = <0xf0000 0x1000>; interrupts = <18 2 0 0>; fsl,tmu-range = <0xb0000 0xa0026 0x80048 0x30061>; - fsl,tmu-calibration = <0x00000000 0x0000000f - 0x00000001 0x00000017 - 0x00000002 0x0000001e - 0x00000003 0x00000026 - 0x00000004 0x0000002e - 0x00000005 0x00000035 - 0x00000006 0x0000003d - 0x00000007 0x00000044 - 0x00000008 0x0000004c - 0x00000009 0x00000053 - 0x0000000a 0x0000005b - 0x0000000b 0x00000064 + fsl,tmu-calibration = + <0x00000000 0x0000000f>, + <0x00000001 0x00000017>, + <0x00000002 0x0000001e>, + <0x00000003 0x00000026>, + <0x00000004 0x0000002e>, + <0x00000005 0x00000035>, + <0x00000006 0x0000003d>, + <0x00000007 0x00000044>, + <0x00000008 0x0000004c>, + <0x00000009 0x00000053>, + <0x0000000a 0x0000005b>, + <0x0000000b 0x00000064>, - 0x00010000 0x00000011 - 0x00010001 0x0000001c - 0x00010002 0x00000024 - 0x00010003 0x0000002b - 0x00010004 0x00000034 - 0x00010005 0x00000039 - 0x00010006 0x00000042 - 0x00010007 0x0000004c - 0x00010008 0x00000051 - 0x00010009 0x0000005a - 0x0001000a 0x00000063 + <0x00010000 0x00000011>, + <0x00010001 0x0000001c>, + <0x00010002 0x00000024>, + <0x00010003 0x0000002b>, + <0x00010004 0x00000034>, + <0x00010005 0x00000039>, + <0x00010006 0x00000042>, + <0x00010007 0x0000004c>, + <0x00010008 0x00000051>, + <0x00010009 0x0000005a>, + <0x0001000a 0x00000063>, - 0x00020000 0x00000013 - 0x00020001 0x00000019 - 0x00020002 0x00000024 - 0x00020003 0x0000002c - 0x00020004 0x00000035 - 0x00020005 0x0000003d - 0x00020006 0x00000046 - 0x00020007 0x00000050 - 0x00020008 0x00000059 + <0x00020000 0x00000013>, + <0x00020001 0x00000019>, + <0x00020002 0x00000024>, + <0x00020003 0x0000002c>, + <0x00020004 0x00000035>, + <0x00020005 0x0000003d>, + <0x00020006 0x00000046>, + <0x00020007 0x00000050>, + <0x00020008 0x00000059>, - 0x00030000 0x00000002 - 0x00030001 0x0000000d - 0x00030002 0x00000019 - 0x00030003 0x00000024>; + <0x00030000 0x00000002>, + <0x00030001 0x0000000d>, + <0x00030002 0x00000019>, + <0x00030003 0x00000024>; #thermal-sensor-cells = <1>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index ad0ab33336b8..776788623204 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -447,41 +447,42 @@ reg = <0xf0000 0x1000>; interrupts = <18 2 0 0>; fsl,tmu-range = <0xa0000 0x90026 0x8004a 0x1006a>; - fsl,tmu-calibration = <0x00000000 0x00000025 - 0x00000001 0x00000028 - 0x00000002 0x0000002d - 0x00000003 0x00000031 - 0x00000004 0x00000036 - 0x00000005 0x0000003a - 0x00000006 0x00000040 - 0x00000007 0x00000044 - 0x00000008 0x0000004a - 0x00000009 0x0000004f - 0x0000000a 0x00000054 + fsl,tmu-calibration = + <0x00000000 0x00000025>, + <0x00000001 0x00000028>, + <0x00000002 0x0000002d>, + <0x00000003 0x00000031>, + <0x00000004 0x00000036>, + <0x00000005 0x0000003a>, + <0x00000006 0x00000040>, + <0x00000007 0x00000044>, + <0x00000008 0x0000004a>, + <0x00000009 0x0000004f>, + <0x0000000a 0x00000054>, - 0x00010000 0x0000000d - 0x00010001 0x00000013 - 0x00010002 0x00000019 - 0x00010003 0x0000001f - 0x00010004 0x00000025 - 0x00010005 0x0000002d - 0x00010006 0x00000033 - 0x00010007 0x00000043 - 0x00010008 0x0000004b - 0x00010009 0x00000053 + <0x00010000 0x0000000d>, + <0x00010001 0x00000013>, + <0x00010002 0x00000019>, + <0x00010003 0x0000001f>, + <0x00010004 0x00000025>, + <0x00010005 0x0000002d>, + <0x00010006 0x00000033>, + <0x00010007 0x00000043>, + <0x00010008 0x0000004b>, + <0x00010009 0x00000053>, - 0x00020000 0x00000010 - 0x00020001 0x00000017 - 0x00020002 0x0000001f - 0x00020003 0x00000029 - 0x00020004 0x00000031 - 0x00020005 0x0000003c - 0x00020006 0x00000042 - 0x00020007 0x0000004d - 0x00020008 0x00000056 + <0x00020000 0x00000010>, + <0x00020001 0x00000017>, + <0x00020002 0x0000001f>, + <0x00020003 0x00000029>, + <0x00020004 0x00000031>, + <0x00020005 0x0000003c>, + <0x00020006 0x00000042>, + <0x00020007 0x0000004d>, + <0x00020008 0x00000056>, - 0x00030000 0x00000012 - 0x00030001 0x0000001d>; + <0x00030000 0x00000012>, + <0x00030001 0x0000001d>; #thermal-sensor-cells = <1>; }; From 88a2b4edda3d0709727be53f4423b0b832d91de3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 4 Dec 2023 09:47:02 +0100 Subject: [PATCH 283/310] x86/Kconfig: Rework CONFIG_X86_PAE dependency While looking at a Xen Kconfig dependency issue, I tried to understand the exact dependencies for CONFIG_X86_PAE, which is selected by CONFIG_HIGHMEM64G but can also be enabled manually. Apparently the dependencies for CONFIG_HIGHMEM64G are strictly about CPUs that do support PAE, but the actual feature can be incorrectly enabled on older CPUs as well. The CONFIG_X86_CMPXCHG64 dependencies on the other hand include X86_PAE because cmpxchg8b is requried for PAE to work. Rework this for readability and correctness, using a positive list of CPUs that support PAE in a new X86_HAVE_PAE symbol that can serve as a dependency for both X86_PAE and HIGHMEM64G as well as simplify the X86_CMPXCHG64 dependency list. Signed-off-by: Arnd Bergmann Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231204084722.3789473-2-arnd@kernel.org --- arch/x86/Kconfig | 4 ++-- arch/x86/Kconfig.cpu | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3762f41bb092..469238067aa1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1415,7 +1415,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 + depends on X86_HAVE_PAE select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1472,7 +1472,7 @@ config HIGHMEM config X86_PAE bool "PAE (Physical Address Extension) Support" - depends on X86_32 && !HIGHMEM4G + depends on X86_32 && X86_HAVE_PAE select PHYS_ADDR_T_64BIT select SWIOTLB help diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 00468adf180f..b9224cf2ee4d 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -362,9 +362,13 @@ config X86_TSC def_bool y depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 +config X86_HAVE_PAE + def_bool y + depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64 + config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 + depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 # this should be set for all -march=.. options where the compiler # generates cmov. From 016919c1f2e5b7ea3436abe6db0b73dbabd36682 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 19 Dec 2023 16:11:56 +0100 Subject: [PATCH 284/310] x86/asm: Provide new infrastructure for GDT descriptors Linus suggested replacing the magic numbers in the GDT descriptors using preprocessor macros. Designing the interface properly is actually pretty hard -- there are several constraints: - you want the final expressions to be readable at a glance; something like GDT_ENTRY_FLAGS(5, 1, 0, 1, 0, 1, 1, 0) isn't because you need to visit the definition to understand what each parameter represents and then match up parameters in the user and the definition (which is hard when there are so many of them) - you want the final expressions to be fairly short/information-dense; something like GDT_ENTRY_PRESENT | GDT_ENTRY_DATA_WRITABLE | GDT_ENTRY_SYSTEM | GDT_ENTRY_DB | GDT_ENTRY_GRANULARITY_4K is a bit too verbose to write out every time and is actually hard to read as well because of all the repetition - you may want to assume defaults for some things (e.g. entries are DPL-0 a.k.a. kernel segments by default) and allow the user to override the default -- but this works best if you can OR in the override; if you want DPL-3 by default and override with DPL-0 you would need to start masking off bits instead of OR-ing them in and that just becomes harder to read - you may want to parameterize some things (e.g. CODE vs. DATA or KERNEL vs. USER) since both values are used and you don't really want prefer either one by default -- or DPL, which is always some value that is always specified This patch tries to balance these requirements and has two layers of definitions -- low-level and high-level: - the low-level defines are the mapping between human-readable names and the actual bit numbers - the high-level defines are the mapping from high-level intent to combinations of low-level flags, representing roughly a tuple (data/code/tss, 64/32/16-bits) plus an override for DPL-3 (= USER), since that's relatively rare but still very important to mark properly for those segments. - we have *_BIOS variants for 32-bit code and data segments that don't have the G flag set and give the limit in terms of bytes instead of pages [ mingo: Improved readability bit more. ] Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20231219151200.2878271-2-vegard.nossum@oracle.com --- arch/x86/include/asm/desc_defs.h | 76 ++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index f7e7099af595..7c08cbf3cbd0 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -8,6 +8,56 @@ * archs. */ +/* + * Low-level interface mapping flags/field names to bits + */ + +/* Flags for _DESC_S (non-system) descriptors */ +#define _DESC_ACCESSED 0x0001 +#define _DESC_DATA_WRITABLE 0x0002 +#define _DESC_CODE_READABLE 0x0002 +#define _DESC_DATA_EXPAND_DOWN 0x0004 +#define _DESC_CODE_CONFORMING 0x0004 +#define _DESC_CODE_EXECUTABLE 0x0008 + +/* Common flags */ +#define _DESC_S 0x0010 +#define _DESC_DPL(dpl) ((dpl) << 5) +#define _DESC_PRESENT 0x0080 + +#define _DESC_LONG_CODE 0x2000 +#define _DESC_DB 0x4000 +#define _DESC_GRANULARITY_4K 0x8000 + +/* System descriptors have a numeric "type" field instead of flags */ +#define _DESC_SYSTEM(code) (code) + +/* + * High-level interface mapping intended usage to low-level combinations + * of flags + */ + +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | \ + _DESC_DATA_WRITABLE) +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | \ + _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) + +#define DESC_DATA16 (_DESC_DATA) +#define DESC_CODE16 (_DESC_CODE) + +#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB) + +#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB) + +#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT) + +#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE) + +#define DESC_USER (_DESC_DPL(3)) + #ifndef __ASSEMBLY__ #include @@ -22,19 +72,19 @@ struct desc_struct { #define GDT_ENTRY_INIT(flags, base, limit) \ { \ - .limit0 = (u16) (limit), \ - .limit1 = ((limit) >> 16) & 0x0F, \ - .base0 = (u16) (base), \ - .base1 = ((base) >> 16) & 0xFF, \ - .base2 = ((base) >> 24) & 0xFF, \ - .type = (flags & 0x0f), \ - .s = (flags >> 4) & 0x01, \ - .dpl = (flags >> 5) & 0x03, \ - .p = (flags >> 7) & 0x01, \ - .avl = (flags >> 12) & 0x01, \ - .l = (flags >> 13) & 0x01, \ - .d = (flags >> 14) & 0x01, \ - .g = (flags >> 15) & 0x01, \ + .limit0 = ((limit) >> 0) & 0xFFFF, \ + .limit1 = ((limit) >> 16) & 0x000F, \ + .base0 = ((base) >> 0) & 0xFFFF, \ + .base1 = ((base) >> 16) & 0x00FF, \ + .base2 = ((base) >> 24) & 0x00FF, \ + .type = ((flags) >> 0) & 0x000F, \ + .s = ((flags) >> 4) & 0x0001, \ + .dpl = ((flags) >> 5) & 0x0003, \ + .p = ((flags) >> 7) & 0x0001, \ + .avl = ((flags) >> 12) & 0x0001, \ + .l = ((flags) >> 13) & 0x0001, \ + .d = ((flags) >> 14) & 0x0001, \ + .g = ((flags) >> 15) & 0x0001, \ } enum { From 41ef75c848e33beb1f7b981866b62b0066f744c7 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 19 Dec 2023 16:11:57 +0100 Subject: [PATCH 285/310] x86/asm: Replace magic numbers in GDT descriptors, preparations We'd like to replace all the magic numbers in various GDT descriptors with new, semantically meaningful, symbolic values. In order to be able to verify that the change doesn't cause any actual changes to the compiled binary code, I've split the change into two patches: - Part 1 (this commit): everything _but_ actually replacing the numbers - Part 2 (the following commit): _only_ replacing the numbers The reason we need this split for verification is that including new headers causes some spurious changes to the object files, mostly line number changes in the debug info but occasionally other subtle codegen changes. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20231219151200.2878271-3-vegard.nossum@oracle.com --- arch/x86/boot/pm.c | 1 + arch/x86/include/asm/desc_defs.h | 2 ++ arch/x86/kernel/cpu/common.c | 8 -------- arch/x86/platform/pvh/head.S | 1 + arch/x86/realmode/rm/reboot.S | 1 + 5 files changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 40031a614712..0361b5307bd8 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -11,6 +11,7 @@ */ #include "boot.h" +#include #include /* diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 7c08cbf3cbd0..33d229ed96dc 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -144,6 +144,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; +#ifndef _SETUP static inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 @@ -158,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g) { return g->segment; } +#endif struct desc_ptr { unsigned short size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b14fc8c1c953..ceb6e4b6d57e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -204,25 +204,17 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - /* 32-bit code */ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* 16-bit data */ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), - /* 16-bit data */ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), - /* 16-bit data */ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - /* 32-bit code */ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* data */ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index c4365a05ab83..9bcafdded2a1 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index f10515b10e0a..447641820a8d 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include #include #include #include From 1445f6e15f7ddd80311307475191e34c0b2312e8 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 19 Dec 2023 16:11:58 +0100 Subject: [PATCH 286/310] x86/asm: Replace magic numbers in GDT descriptors, script-generated change Actually replace the numeric values by the new symbolic values. I used this to find all the existing users of the GDT_ENTRY*() macros: $ git grep -P 'GDT_ENTRY(_INIT)?\(' Some of the lines will exceed 80 characters, but some of them will be shorter again in the next couple of patches. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20231219151200.2878271-4-vegard.nossum@oracle.com --- arch/x86/boot/pm.c | 6 ++-- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/common.c | 40 ++++++++++++------------- arch/x86/kernel/head64.c | 6 ++-- arch/x86/kernel/setup_percpu.c | 4 +-- arch/x86/platform/pvh/head.S | 6 ++-- arch/x86/realmode/rm/reboot.S | 2 +- drivers/firmware/efi/libstub/x86-5lvl.c | 4 +-- drivers/pnp/pnpbios/bioscalls.c | 2 +- 9 files changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 0361b5307bd8..ab35b52d2c4b 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -68,13 +68,13 @@ static void setup_gdt(void) being 8-byte unaligned. Intel recommends 16 byte alignment. */ static const u64 boot_gdt[] __attribute__((aligned(16))) = { /* CS: code, read/execute, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32 | _DESC_ACCESSED, 0, 0xfffff), /* TSS: 32-bit tss, 104 bytes, base 4096 */ /* We only have a TSS here to keep Intel VT happy; we don't actually use it for anything. */ - [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103), }; /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead of the gdt_ptr contents. Thus, make it static so it will diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5934ee5bc087..76a5ced278c2 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex); * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. */ -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS, (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ceb6e4b6d57e..32934a0656af 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -188,37 +188,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), - [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), - [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 05a110c97111..00dbddfdfece 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -71,9 +71,9 @@ EXPORT_SYMBOL(vmemmap_base); * GDT used on the boot CPU before switching to virtual addresses. */ static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64 | _DESC_ACCESSED, 0, 0xfffff), }; /* diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2c97bf7b56ae..f2583de97a64 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), - 0xFFFFF); + struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32 & ~_DESC_DB, + per_cpu_offset(cpu), 0xFFFFF); write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index 9bcafdded2a1..7c6a1089ce1c 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -149,11 +149,11 @@ SYM_DATA_END(gdt) SYM_DATA_START_LOCAL(gdt_start) .quad 0x0000000000000000 /* NULL descriptor */ #ifdef CONFIG_X86_64 - .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */ + .quad GDT_ENTRY(DESC_CODE64, 0, 0xfffff) /* PVH_CS_SEL */ #else - .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */ + .quad GDT_ENTRY(DESC_CODE32, 0, 0xfffff) /* PVH_CS_SEL */ #endif - .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */ + .quad GDT_ENTRY(DESC_DATA32, 0, 0xfffff) /* PVH_DS_SEL */ SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index 447641820a8d..5bc068b9acdd 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -154,5 +154,5 @@ SYM_DATA_START(machine_real_restart_gdt) * base value 0x100; since this is consistent with real mode * semantics we don't have to reload the segments once CR0.PE = 0. */ - .quad GDT_ENTRY(0x0093, 0x100, 0xffff) + .quad GDT_ENTRY(DESC_DATA16 | _DESC_ACCESSED, 0x100, 0xffff) SYM_DATA_END(machine_real_restart_gdt) diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c index 479dd445acdc..005dd9b14f95 100644 --- a/drivers/firmware/efi/libstub/x86-5lvl.c +++ b/drivers/firmware/efi/libstub/x86-5lvl.c @@ -13,8 +13,8 @@ bool efi_no5lvl; static void (*la57_toggle)(void *cr3); static const struct desc_struct gdt[] = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), }; /* diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index ddc6f2163c8e..1f31dce5835a 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -60,7 +60,7 @@ do { \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS, (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); /* From 3b184b71dfcb156e08246f8fbe0cd088c6a6efed Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 19 Dec 2023 16:11:59 +0100 Subject: [PATCH 287/310] x86/asm: Always set A (accessed) flag in GDT descriptors We have no known use for having the CPU track whether GDT descriptors have been accessed or not. Simplify the code by adding the flag to the common flags and removing it everywhere else. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20231219151200.2878271-5-vegard.nossum@oracle.com --- arch/x86/boot/pm.c | 4 ++-- arch/x86/include/asm/desc_defs.h | 4 ++-- arch/x86/kernel/cpu/common.c | 12 ++++++------ arch/x86/kernel/head64.c | 6 +++--- arch/x86/realmode/rm/reboot.S | 2 +- drivers/firmware/efi/libstub/x86-5lvl.c | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index ab35b52d2c4b..5941f930f6c5 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -68,9 +68,9 @@ static void setup_gdt(void) being 8-byte unaligned. Intel recommends 16 byte alignment. */ static const u64 boot_gdt[] __attribute__((aligned(16))) = { /* CS: code, read/execute, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff), /* TSS: 32-bit tss, 104 bytes, base 4096 */ /* We only have a TSS here to keep Intel VT happy; we don't actually use it for anything. */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 33d229ed96dc..d440a65af8f3 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -37,9 +37,9 @@ * of flags */ -#define _DESC_DATA (_DESC_S | _DESC_PRESENT | \ +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ _DESC_DATA_WRITABLE) -#define _DESC_CODE (_DESC_S | _DESC_PRESENT | \ +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) #define DESC_DATA16 (_DESC_DATA) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 32934a0656af..6184488a7d77 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -188,12 +188,12 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff), #else [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 00dbddfdfece..dc0956067944 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -71,9 +71,9 @@ EXPORT_SYMBOL(vmemmap_base); * GDT used on the boot CPU before switching to virtual addresses. */ static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), }; /* diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index 5bc068b9acdd..e714b4624e36 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -154,5 +154,5 @@ SYM_DATA_START(machine_real_restart_gdt) * base value 0x100; since this is consistent with real mode * semantics we don't have to reload the segments once CR0.PE = 0. */ - .quad GDT_ENTRY(DESC_DATA16 | _DESC_ACCESSED, 0x100, 0xffff) + .quad GDT_ENTRY(DESC_DATA16, 0x100, 0xffff) SYM_DATA_END(machine_real_restart_gdt) diff --git a/drivers/firmware/efi/libstub/x86-5lvl.c b/drivers/firmware/efi/libstub/x86-5lvl.c index 005dd9b14f95..77359e802181 100644 --- a/drivers/firmware/efi/libstub/x86-5lvl.c +++ b/drivers/firmware/efi/libstub/x86-5lvl.c @@ -13,8 +13,8 @@ bool efi_no5lvl; static void (*la57_toggle)(void *cr3); static const struct desc_struct gdt[] = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | _DESC_ACCESSED, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64 | _DESC_ACCESSED, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), }; /* From bc90aefa99f74452d549d503a3f1cbf3adc9c6bb Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 19 Dec 2023 16:12:00 +0100 Subject: [PATCH 288/310] x86/asm: Add DB flag to 32-bit percpu GDT entry The D/B size flag for the 32-bit percpu GDT entry was not set. The Intel manual (vol 3, section 3.4.5) only specifies the meaning of this flag for three cases: 1) code segments used for %cs -- doesn't apply here 2) stack segments used for %ss -- doesn't apply 3) expand-down data segments -- but we don't have the expand-down flag set, so it also doesn't apply here The flag likely doesn't do anything here, although the manual does also say: "This flag should always be set to 1 for 32-bit code and data segments [...]" so we should probably do it anyway. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/20231219151200.2878271-6-vegard.nossum@oracle.com --- arch/x86/kernel/setup_percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f2583de97a64..b30d6e180df7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -106,7 +106,7 @@ void __init pcpu_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32 & ~_DESC_DB, + struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32, per_cpu_offset(cpu), 0xFFFFF); write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); From 257ca14f4d780e27a0605fd68053d2cc3178a232 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Dec 2023 14:13:04 +0000 Subject: [PATCH 289/310] x86/boot: Remove redundant initialization of the 'delta' variable in strcmp() The 'delta' variable is zero-initialized, but never read before the real initialization happens. The assignment is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231219141304.367200-1-colin.i.king@gmail.com --- arch/x86/boot/string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 1c8541ae3b3a..c23f3b9c84fe 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -49,7 +49,7 @@ int strcmp(const char *str1, const char *str2) { const unsigned char *s1 = (const unsigned char *)str1; const unsigned char *s2 = (const unsigned char *)str2; - int delta = 0; + int delta; while (*s1 || *s2) { delta = *s1 - *s2; From 6addc560e69cd1b2e68ef43ad62a878ac1956f51 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 21 Dec 2023 12:45:08 +0800 Subject: [PATCH 290/310] powerpc/mpc83xx: Add the missing set_freezable() for agent_thread_fn() The kernel thread function agent_thread_fn() invokes the try_to_freeze() in its loop. But all the kernel threads are non-freezable by default. So if we want to make a kernel thread to be freezable, we have to invoke set_freezable() explicitly. Signed-off-by: Kevin Hao Signed-off-by: Michael Ellerman Link: https://msgid.link/20231221044510.1802429-2-haokexin@gmail.com --- arch/powerpc/platforms/83xx/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 9833c36bda83..eed325ed08cc 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -261,6 +261,8 @@ static int mpc83xx_suspend_begin(suspend_state_t state) static int agent_thread_fn(void *data) { + set_freezable(); + while (1) { wait_event_interruptible(agent_wq, pci_pm_state >= 2); try_to_freeze(); From 11611d254c15cce1f58431b2965c6edb5aa7e610 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 21 Dec 2023 12:45:09 +0800 Subject: [PATCH 291/310] powerpc/mpc83xx: Use wait_event_freezable() for freezable kthread A freezable kernel thread can enter frozen state during freezing by either calling try_to_freeze() or using wait_event_freezable() and its variants. So for the following snippet of code in a kernel thread loop: wait_event_interruptible(); try_to_freeze(); We can change it to a simple wait_event_freezable() and then eliminate a function call. Signed-off-by: Kevin Hao Signed-off-by: Michael Ellerman Link: https://msgid.link/20231221044510.1802429-3-haokexin@gmail.com --- arch/powerpc/platforms/83xx/suspend.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index eed325ed08cc..c9664e46b03d 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -264,8 +264,7 @@ static int agent_thread_fn(void *data) set_freezable(); while (1) { - wait_event_interruptible(agent_wq, pci_pm_state >= 2); - try_to_freeze(); + wait_event_freezable(agent_wq, pci_pm_state >= 2); if (signal_pending(current) || pci_pm_state < 2) continue; From ccc0f7b7673e63139ba9d916f4567d4fadb14b55 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Thu, 21 Dec 2023 12:45:10 +0800 Subject: [PATCH 292/310] powerpc/ps3: Add missing set_freezable() for ps3_probe_thread() The kernel thread function ps3_probe_thread() invokes the try_to_freeze() in its loop. But all the kernel threads are non-freezable by default. So if we want to make a kernel thread to be freezable, we have to invoke set_freezable() explicitly. Signed-off-by: Kevin Hao Acked-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://msgid.link/20231221044510.1802429-4-haokexin@gmail.com --- arch/powerpc/platforms/ps3/device-init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index e87360a0fb40..878bc160246e 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -827,6 +827,7 @@ static int ps3_probe_thread(void *data) if (res) goto fail_free_irq; + set_freezable(); /* Loop here processing the requested notification events. */ do { try_to_freeze(); From eb8446e164572180c2cd0ea4e8494e4419202396 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Tue, 19 Dec 2023 14:52:36 +0530 Subject: [PATCH 293/310] powerpc/hvcall: Reorder Nestedv2 hcall opcodes Reorder the newly introduced hcall opcodes for Nestedv2 to follow the increasing-opcode-number convention followed in 'hvcall.h'. Also updates the value for MAX_HCALL_OPCODE which is used in various places in arch code for range checking. Notably in the KVM enabled-hcall logic, and in hcall tracing. Fixes: 19d31c5f1157 ("KVM: PPC: Add support for nestedv2 guests") Suggested-by: Michael Ellerman Signed-off-by: Vaibhav Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20231219092309.118151-1-vaibhav@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index ddb99e982917..a41e542ba94d 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -349,7 +349,16 @@ #define H_GET_ENERGY_SCALE_INFO 0x450 #define H_PKS_SIGNED_UPDATE 0x454 #define H_WATCHDOG 0x45C -#define MAX_HCALL_OPCODE H_WATCHDOG +#define H_GUEST_GET_CAPABILITIES 0x460 +#define H_GUEST_SET_CAPABILITIES 0x464 +#define H_GUEST_CREATE 0x470 +#define H_GUEST_CREATE_VCPU 0x474 +#define H_GUEST_GET_STATE 0x478 +#define H_GUEST_SET_STATE 0x47C +#define H_GUEST_RUN_VCPU 0x480 +#define H_GUEST_COPY_MEMORY 0x484 +#define H_GUEST_DELETE 0x488 +#define MAX_HCALL_OPCODE H_GUEST_DELETE /* Scope args for H_SCM_UNBIND_ALL */ #define H_UNBIND_SCOPE_ALL (0x1) @@ -393,15 +402,6 @@ #define H_ENTER_NESTED 0xF804 #define H_TLB_INVALIDATE 0xF808 #define H_COPY_TOFROM_GUEST 0xF80C -#define H_GUEST_GET_CAPABILITIES 0x460 -#define H_GUEST_SET_CAPABILITIES 0x464 -#define H_GUEST_CREATE 0x470 -#define H_GUEST_CREATE_VCPU 0x474 -#define H_GUEST_GET_STATE 0x478 -#define H_GUEST_SET_STATE 0x47C -#define H_GUEST_RUN_VCPU 0x480 -#define H_GUEST_COPY_MEMORY 0x484 -#define H_GUEST_DELETE 0x488 /* Flags for H_SVM_PAGE_IN */ #define H_PAGE_IN_SHARED 0x1 From ba5b952ad5f52e58c0f288b9d5427ad734600568 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 15 Dec 2023 11:24:56 +0000 Subject: [PATCH 294/310] selftests/powerpc: Fix spelling mistake "EACCESS" -> "EACCES" There is a spelling mistake of the EACCES error name, fix it. Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman Link: https://msgid.link/20231215112456.13554-1-colin.i.king@gmail.com --- tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c index d5436de5b8ed..f56c15a11e2f 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c +++ b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c @@ -177,7 +177,7 @@ static const struct sysparm_test sysparm_tests[] = { }, { .function = set_with_ro_fd, - .description = "PAPR_IOC_SYSPARM_SET returns EACCESS on read-only fd", + .description = "PAPR_IOC_SYSPARM_SET returns EACCES on read-only fd", }, }; From e6beb47edb89ca9dc8906515e2dfbeb5913312c8 Mon Sep 17 00:00:00 2001 From: Haoran Liu Date: Mon, 27 Nov 2023 06:41:08 -0800 Subject: [PATCH 295/310] powerpc/powernv: Add error handling to opal_prd_range_is_valid In the opal_prd_range_is_valid function within opal-prd.c, error handling was missing for the of_get_address call. This patch adds necessary error checking, ensuring that the function gracefully handles scenarios where of_get_address fails. Signed-off-by: Haoran Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231127144108.29782-1-liuhaoran14@163.com --- arch/powerpc/platforms/powernv/opal-prd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 327e2f76905d..b66b06efcef1 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -66,6 +66,8 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size) const char *label; addrp = of_get_address(node, 0, &range_size, NULL); + if (!addrp) + continue; range_addr = of_read_number(addrp, 2); range_end = range_addr + range_size; From a6293b3e285cd0d7692141d7981a5f144f0e2f0b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 17:48:52 +0200 Subject: [PATCH 296/310] fs: factor out backing_file_{read,write}_iter() helpers Overlayfs submits files io to backing files on other filesystems. Factor out some common helpers to perform io to backing files, into fs/backing-file.c. Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com Signed-off-by: Amir Goldstein --- fs/backing-file.c | 210 +++++++++++++++++++++++++++++++++++ fs/overlayfs/file.c | 188 +++---------------------------- fs/overlayfs/overlayfs.h | 8 +- fs/overlayfs/super.c | 11 +- include/linux/backing-file.h | 15 +++ 5 files changed, 247 insertions(+), 185 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index 04b33036f709..323187a49da3 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -2,6 +2,9 @@ /* * Common helpers for stackable filesystems and backing files. * + * Forked from fs/overlayfs/file.c. + * + * Copyright (C) 2017 Red Hat, Inc. * Copyright (C) 2023 CTERA Networks. */ @@ -46,3 +49,210 @@ struct file *backing_file_open(const struct path *user_path, int flags, return f; } EXPORT_SYMBOL_GPL(backing_file_open); + +struct backing_aio { + struct kiocb iocb; + refcount_t ref; + struct kiocb *orig_iocb; + /* used for aio completion */ + void (*end_write)(struct file *); + struct work_struct work; + long res; +}; + +static struct kmem_cache *backing_aio_cachep; + +#define BACKING_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) +{ + return (__force rwf_t)(flags & BACKING_IOCB_MASK); +} + +static void backing_aio_put(struct backing_aio *aio) +{ + if (refcount_dec_and_test(&aio->ref)) { + fput(aio->iocb.ki_filp); + kmem_cache_free(backing_aio_cachep, aio); + } +} + +static void backing_aio_cleanup(struct backing_aio *aio, long res) +{ + struct kiocb *iocb = &aio->iocb; + struct kiocb *orig_iocb = aio->orig_iocb; + + if (aio->end_write) + aio->end_write(orig_iocb->ki_filp); + + orig_iocb->ki_pos = iocb->ki_pos; + backing_aio_put(aio); +} + +static void backing_aio_rw_complete(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + struct kiocb *orig_iocb = aio->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) + kiocb_end_write(iocb); + + backing_aio_cleanup(aio, res); + orig_iocb->ki_complete(orig_iocb, res); +} + +static void backing_aio_complete_work(struct work_struct *work) +{ + struct backing_aio *aio = container_of(work, struct backing_aio, work); + + backing_aio_rw_complete(&aio->iocb, aio->res); +} + +static void backing_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio->res = res; + INIT_WORK(&aio->work, backing_aio_complete_work); + queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio->work); +} + +static int backing_aio_init_wq(struct kiocb *iocb) +{ + struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; + + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + + +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + struct backing_aio *aio = NULL; + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); + } else { + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_complete = backing_aio_rw_complete; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_read(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_read_iter); + +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) + return -EIO; + + if (!iov_iter_count(iter)) + return 0; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT && + !(file->f_mode & FMODE_CAN_ODIRECT)) + return -EINVAL; + + /* + * Stacked filesystems don't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + flags &= ~IOCB_DIO_CALLER_COMP; + + old_cred = override_creds(ctx->cred); + if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(flags); + + ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); + if (ctx->end_write) + ctx->end_write(ctx->user_file); + } else { + struct backing_aio *aio; + + ret = backing_aio_init_wq(iocb); + if (ret) + goto out; + + ret = -ENOMEM; + aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); + if (!aio) + goto out; + + aio->orig_iocb = iocb; + aio->end_write = ctx->end_write; + kiocb_clone(&aio->iocb, iocb, get_file(file)); + aio->iocb.ki_flags = flags; + aio->iocb.ki_complete = backing_aio_queue_completion; + refcount_set(&aio->ref, 2); + ret = vfs_iocb_iter_write(file, &aio->iocb, iter); + backing_aio_put(aio); + if (ret != -EIOCBQUEUED) + backing_aio_cleanup(aio, ret); + } +out: + revert_creds(old_cred); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_write_iter); + +static int __init backing_aio_init(void) +{ + backing_aio_cachep = kmem_cache_create("backing_aio", + sizeof(struct backing_aio), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!backing_aio_cachep) + return -ENOMEM; + + return 0; +} +fs_initcall(backing_aio_init); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a6da3eaf6d4f..1b578cb27a26 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -16,19 +16,6 @@ #include #include "overlayfs.h" -#include "../internal.h" /* for sb_init_dio_done_wq */ - -struct ovl_aio_req { - struct kiocb iocb; - refcount_t ref; - struct kiocb *orig_iocb; - /* used for aio completion */ - struct work_struct work; - long res; -}; - -static struct kmem_cache *ovl_aio_request_cachep; - static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -275,84 +262,16 @@ static void ovl_file_accessed(struct file *file) touch_atime(&file->f_path); } -#define OVL_IOCB_MASK \ - (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) - -static rwf_t iocb_to_rw_flags(int flags) -{ - return (__force rwf_t)(flags & OVL_IOCB_MASK); -} - -static inline void ovl_aio_put(struct ovl_aio_req *aio_req) -{ - if (refcount_dec_and_test(&aio_req->ref)) { - fput(aio_req->iocb.ki_filp); - kmem_cache_free(ovl_aio_request_cachep, aio_req); - } -} - -static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) -{ - struct kiocb *iocb = &aio_req->iocb; - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) - ovl_file_modified(orig_iocb->ki_filp); - - orig_iocb->ki_pos = iocb->ki_pos; - ovl_aio_put(aio_req); -} - -static void ovl_aio_rw_complete(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - if (iocb->ki_flags & IOCB_WRITE) - kiocb_end_write(iocb); - - ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res); -} - -static void ovl_aio_complete_work(struct work_struct *work) -{ - struct ovl_aio_req *aio_req = container_of(work, - struct ovl_aio_req, work); - - ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); -} - -static void ovl_aio_queue_completion(struct kiocb *iocb, long res) -{ - struct ovl_aio_req *aio_req = container_of(iocb, - struct ovl_aio_req, iocb); - struct kiocb *orig_iocb = aio_req->orig_iocb; - - /* - * Punt to a work queue to serialize updates of mtime/size. - */ - aio_req->res = res; - INIT_WORK(&aio_req->work, ovl_aio_complete_work); - queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, - &aio_req->work); -} - -static int ovl_init_aio_done_wq(struct super_block *sb) -{ - if (sb->s_dio_done_wq) - return 0; - - return sb_init_dio_done_wq(sb); -} - static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct fd real; - const struct cred *old_cred; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; if (!iov_iter_count(iter)) return 0; @@ -361,37 +280,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret) return ret; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); - - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); - } else { - struct ovl_aio_req *aio_req; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_complete = ovl_aio_rw_complete; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); - ovl_file_accessed(file); -out_fdput: + ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags, + &ctx); fdput(real); return ret; @@ -402,9 +292,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; - const struct cred *old_cred; ssize_t ret; int ifl = iocb->ki_flags; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = file, + .end_write = ovl_file_modified, + }; if (!iov_iter_count(iter)) return 0; @@ -412,19 +306,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(file); - if (ret) - goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; - ret = -EINVAL; - if (iocb->ki_flags & IOCB_DIRECT && - !(real.file->f_mode & FMODE_CAN_ODIRECT)) - goto out_fdput; - if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); @@ -433,39 +319,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) * this property in case it is set by the issuer. */ ifl &= ~IOCB_DIO_CALLER_COMP; - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - if (is_sync_kiocb(iocb)) { - rwf_t rwf = iocb_to_rw_flags(ifl); - - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); - /* Update size */ - ovl_file_modified(file); - } else { - struct ovl_aio_req *aio_req; - - ret = ovl_init_aio_done_wq(inode->i_sb); - if (ret) - goto out; - - ret = -ENOMEM; - aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); - if (!aio_req) - goto out; - - aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); - aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_queue_completion; - refcount_set(&aio_req->ref, 2); - ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); - ovl_aio_put(aio_req); - if (ret != -EIOCBQUEUED) - ovl_aio_cleanup_handler(aio_req); - } -out: - revert_creds(old_cred); -out_fdput: + ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx); fdput(real); out_unlock: @@ -777,19 +631,3 @@ const struct file_operations ovl_file_operations = { .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; - -int __init ovl_aio_request_cache_init(void) -{ - ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", - sizeof(struct ovl_aio_req), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ovl_aio_request_cachep) - return -ENOMEM; - - return 0; -} - -void ovl_aio_request_cache_destroy(void) -{ - kmem_cache_destroy(ovl_aio_request_cachep); -} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index ca88b2636a57..509a57c85fae 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -417,6 +417,12 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); + +static inline const struct cred *ovl_creds(struct super_block *sb) +{ + return OVL_FS(sb)->creator_cred; +} + int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -829,8 +835,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int __init ovl_aio_request_cache_init(void); -void ovl_aio_request_cache_destroy(void); int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a0967bb25003..bcd4c314a7eb 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1501,14 +1501,10 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = ovl_aio_request_cache_init(); - if (!err) { - err = register_filesystem(&ovl_fs_type); - if (!err) - return 0; + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; - ovl_aio_request_cache_destroy(); - } kmem_cache_destroy(ovl_inode_cachep); return err; @@ -1524,7 +1520,6 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 55c9e804f780..0648d548a418 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -9,9 +9,24 @@ #define _LINUX_BACKING_FILE_H #include +#include +#include + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *); +}; struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:13:12 +0300 Subject: [PATCH 297/310] fs: factor out backing_file_splice_{read,write}() helpers There is not much in those helpers, but it makes sense to have them logically next to the backing_file_{read,write}_iter() helpers as they may grow more common logic in the future. Signed-off-by: Amir Goldstein --- fs/backing-file.c | 51 ++++++++++++++++++++++++++++++++++++ fs/overlayfs/file.c | 33 +++++++++-------------- include/linux/backing-file.h | 8 ++++++ 3 files changed, 72 insertions(+), 20 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index 323187a49da3..ddd35c1d6c71 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -10,6 +10,7 @@ #include #include +#include #include "internal.h" @@ -245,6 +246,56 @@ out: } EXPORT_SYMBOL_GPL(backing_file_write_iter); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) + return -EIO; + + old_cred = override_creds(ctx->cred); + ret = vfs_splice_read(in, ppos, pipe, len, flags); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_read); + +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + ssize_t ret; + + if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) + return -EIO; + + ret = file_remove_privs(ctx->user_file); + if (ret) + return ret; + + old_cred = override_creds(ctx->cred); + file_start_write(out); + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + file_end_write(out); + revert_creds(old_cred); + + if (ctx->end_write) + ctx->end_write(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_splice_write); + static int __init backing_aio_init(void) { backing_aio_cachep = kmem_cache_create("backing_aio", diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 1b578cb27a26..69b52d2f9c74 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -332,20 +331,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - const struct cred *old_cred; struct fd real; ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(in)->i_sb), + .user_file = in, + .accessed = ovl_file_accessed, + }; ret = ovl_real_fdget(in, &real); if (ret) return ret; - old_cred = ovl_override_creds(file_inode(in)->i_sb); - ret = vfs_splice_read(real.file, ppos, pipe, len, flags); - revert_creds(old_cred); - ovl_file_accessed(in); - + ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx); fdput(real); + return ret; } @@ -361,30 +361,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct fd real; - const struct cred *old_cred; struct inode *inode = file_inode(out); ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(inode->i_sb), + .user_file = out, + .end_write = ovl_file_modified, + }; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); - ret = file_remove_privs(out); - if (ret) - goto out_unlock; ret = ovl_real_fdget(out, &real); if (ret) goto out_unlock; - old_cred = ovl_override_creds(inode->i_sb); - file_start_write(real.file); - - ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); - - file_end_write(real.file); - /* Update size */ - ovl_file_modified(out); - revert_creds(old_cred); + ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx); fdput(real); out_unlock: diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0648d548a418..0546d5b1c9f5 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From f567377e406c032fff0799bde4fdf4a977529b84 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:49:37 +0300 Subject: [PATCH 298/310] fs: factor out backing_file_mmap() helper Assert that the file object is allocated in a backing_file container so that file_user_path() could be used to display the user path and not the backing file's path in /proc//maps. Signed-off-by: Amir Goldstein --- fs/backing-file.c | 27 +++++++++++++++++++++++++++ fs/overlayfs/file.c | 23 ++++++----------------- include/linux/backing-file.h | 2 ++ 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index ddd35c1d6c71..a681f38d84d8 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" @@ -296,6 +297,32 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, } EXPORT_SYMBOL_GPL(backing_file_splice_write); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx) +{ + const struct cred *old_cred; + int ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || + WARN_ON_ONCE(ctx->user_file != vma->vm_file)) + return -EIO; + + if (!file->f_op->mmap) + return -ENODEV; + + vma_set_file(vma, file); + + old_cred = override_creds(ctx->cred); + ret = call_mmap(vma->vm_file, vma); + revert_creds(old_cred); + + if (ctx->accessed) + ctx->accessed(ctx->user_file); + + return ret; +} +EXPORT_SYMBOL_GPL(backing_file_mmap); + static int __init backing_aio_init(void) { backing_aio_cachep = kmem_cache_create("backing_aio", diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 69b52d2f9c74..05536964d37f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include "overlayfs.h" @@ -415,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct file *realfile = file->private_data; - const struct cred *old_cred; - int ret; + struct backing_file_ctx ctx = { + .cred = ovl_creds(file_inode(file)->i_sb), + .user_file = file, + .accessed = ovl_file_accessed, + }; - if (!realfile->f_op->mmap) - return -ENODEV; - - if (WARN_ON(file != vma->vm_file)) - return -EIO; - - vma_set_file(vma, realfile); - - old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); - revert_creds(old_cred); - ovl_file_accessed(file); - - return ret; + return backing_file_mmap(realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0546d5b1c9f5..3f1fe1774f1b 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ From c39e2ae3943d4ee278af4e1b1dcfd5946da1089b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 28 Dec 2023 11:06:08 +0100 Subject: [PATCH 299/310] fs: fix __sb_write_started() kerneldoc formatting When running 'make htmldocs', I see the following warning: Documentation/filesystems/api-summary:14: ./include/linux/fs.h:1659: WARNING: Definition list ends without a blank line; unexpected unindent. The official guidance [1] seems to be to use lists, which will prevent both the "unexpected unindent" warning as well as ensure that each line is formatted on a separate line in the HTML output instead of being all considered a single paragraph. [1]: https://docs.kernel.org/doc-guide/kernel-doc.html#return-values Fixes: 8802e580ee64 ("fs: create __sb_write_started() helper") Cc: Amir Goldstein Cc: Josef Bacik Cc: Jan Kara Signed-off-by: Vegard Nossum Link: https://lore.kernel.org/r/20231228100608.3123987-1-vegard.nossum@oracle.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index db5d07e6e02e..473063f385e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1650,9 +1650,9 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) * @sb: the super we write to * @level: the freeze level * - * > 0 sb freeze level is held - * 0 sb freeze level is not held - * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { From 5bb13e63cb00f0fdca5141f33d7a47bb26730a81 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 23 Nov 2023 14:29:02 +1100 Subject: [PATCH 300/310] powerpc/86xx: Drop unused CONFIG_MPC8610 The MPC8610 symbol used to be default y if MPC8610_HPCD, but since MPC8610_HPCD was removed MPC8610 is now never used. Remove it. Fixes: 248667f8bbde ("powerpc: drop HPCD/MPC8610 evaluation platform support") Signed-off-by: Michael Ellerman Link: https://msgid.link/20231123032902.2760818-1-mpe@ellerman.id.au --- arch/powerpc/platforms/86xx/Kconfig | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig index 67467cd6f34c..06b1e5c49d6f 100644 --- a/arch/powerpc/platforms/86xx/Kconfig +++ b/arch/powerpc/platforms/86xx/Kconfig @@ -52,10 +52,3 @@ config MPC8641 select MPIC default y if GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ || MVME7100 - -config MPC8610 - bool - select HAVE_PCI - select FSL_PCI if PCI - select PPC_UDBG_16550 - select MPIC From 482b718a84f08b6fc84879c3e90cc57dba11c115 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Sun, 24 Dec 2023 09:52:46 +0900 Subject: [PATCH 301/310] powerpc/ps3_defconfig: Disable PPC64_BIG_ENDIAN_ELF_ABI_V2 Commit 8c5fa3b5c4df ("powerpc/64: Make ELFv2 the default for big-endian builds"), merged in Linux-6.5-rc1 changes the calling ABI in a way that is incompatible with the current code for the PS3's LV1 hypervisor calls. This change just adds the line '# CONFIG_PPC64_BIG_ENDIAN_ELF_ABI_V2 is not set' to the ps3_defconfig file so that the PPC64_ELF_ABI_V1 is used. Fixes run time errors like these: BUG: Kernel NULL pointer dereference at 0x00000000 Faulting instruction address: 0xc000000000047cf0 Oops: Kernel access of bad area, sig: 11 [#1] Call Trace: [c0000000023039e0] [c00000000100ebfc] ps3_create_spu+0xc4/0x2b0 (unreliable) [c000000002303ab0] [c00000000100d4c4] create_spu+0xcc/0x3c4 [c000000002303b40] [c00000000100eae4] ps3_enumerate_spus+0xa4/0xf8 Fixes: 8c5fa3b5c4df ("powerpc/64: Make ELFv2 the default for big-endian builds") Cc: stable@vger.kernel.org # v6.5+ Signed-off-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://msgid.link/df906ac1-5f17-44b9-b0bb-7cd292a0df65@infradead.org --- arch/powerpc/configs/ps3_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index 2b175ddf82f0..aa8bb0208bcc 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -24,6 +24,7 @@ CONFIG_PS3_VRAM=m CONFIG_PS3_LPM=m # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set CONFIG_KEXEC=y +# CONFIG_PPC64_BIG_ENDIAN_ELF_ABI_V2 is not set CONFIG_PPC_4K_PAGES=y CONFIG_SCHED_SMT=y CONFIG_PM=y From 7991ed43587d1106315208cc289c851d6915d4a3 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 30 Dec 2023 12:20:04 +0100 Subject: [PATCH 302/310] x86/alternative: Correct feature bit debug output In https://lore.kernel.org/r/20231206110636.GBZXBVvCWj2IDjVk4c@fat_crate.local I wanted to adjust the alternative patching debug output to the new changes introduced by da0fe6e68e10 ("x86/alternative: Add indirect call patching") but removed the '*' which denotes the ->x86_capability word. The correct output should be, for example: [ 0.230071] SMP alternatives: feat: 11*32+15, old: (entry_SYSCALL_64_after_hwframe+0x5a/0x77 (ffffffff81c000c2) len: 16), repl: (ffffffff89ae896a, len: 5) flags: 0x0 while the incorrect one says "... 1132+15" currently. Add back the '*'. Fixes: da0fe6e68e10 ("x86/alternative: Add indirect call patching") Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231206110636.GBZXBVvCWj2IDjVk4c@fat_crate.local --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f26983a3e035..f7ea108fbd88 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -498,7 +498,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, continue; } - DPRINTK(ALT, "feat: %d32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", + DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, From 1e92af09fab1b5589f3a7ae68109e3c6a5ca6c6e Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 7 Dec 2023 09:45:12 +0800 Subject: [PATCH 303/310] EDAC/skx_common: Filter out the invalid address Decoding an invalid address with certain firmware decoders could cause a #PF (Page Fault) in the EFI runtime context, which could subsequently hang the system. To make {i10nm,skx}_edac more robust against such bogus firmware decoders, filter out invalid addresses before allowing the firmware decoder to process them. Suggested-by: Tony Luck Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20231207014512.78564-1-qiuxu.zhuo@intel.com --- drivers/edac/skx_common.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index ce3e0069e028..9c5b6f8bd8bd 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -648,6 +648,10 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val, memset(&res, 0, sizeof(res)); res.mce = mce; res.addr = mce->addr & MCI_ADDR_PHYSADDR; + if (!pfn_to_online_page(res.addr >> PAGE_SHIFT)) { + pr_err("Invalid address 0x%llx in IA32_MC%d_ADDR\n", mce->addr, mce->bank); + return NOTIFY_DONE; + } /* Try driver decoder first */ if (!(driver_decode && driver_decode(&res))) { From d642ef7111014805f2e21e9cddb0c0a93ae1313d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 26 Dec 2023 14:28:03 +0100 Subject: [PATCH 304/310] virt: sev-guest: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Borislav Petkov (AMD) Acked-by: Tom Lendacky Link: https://lore.kernel.org/r/52826a50250304ab0af14c594009f7b901c2cd31.1703596577.git.u.kleine-koenig@pengutronix.de --- drivers/virt/coco/sev-guest/sev-guest.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index bc564adcf499..87f241825bc3 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -994,7 +994,7 @@ e_unmap: return ret; } -static int __exit sev_guest_remove(struct platform_device *pdev) +static void __exit sev_guest_remove(struct platform_device *pdev) { struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); @@ -1003,8 +1003,6 @@ static int __exit sev_guest_remove(struct platform_device *pdev) free_shared_pages(snp_dev->request, sizeof(struct snp_guest_msg)); deinit_crypto(snp_dev->crypto); misc_deregister(&snp_dev->misc); - - return 0; } /* @@ -1013,7 +1011,7 @@ static int __exit sev_guest_remove(struct platform_device *pdev) * with the SEV-SNP support, it is named "sev-guest". */ static struct platform_driver sev_guest_driver = { - .remove = __exit_p(sev_guest_remove), + .remove_new = __exit_p(sev_guest_remove), .driver = { .name = "sev-guest", }, From 54aa699e8094efb7d7675fefbc03dfce24f98456 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 2 Jan 2024 18:40:11 -0600 Subject: [PATCH 305/310] arch/x86: Fix typos Fix typos, most reported by "codespell arch/x86". Only touches comments, no code changes. Signed-off-by: Bjorn Helgaas Signed-off-by: Ingo Molnar Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20240103004011.1758650-1-helgaas@kernel.org --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/mem.c | 2 +- arch/x86/coco/tdx/tdx.c | 2 +- arch/x86/crypto/aesni-intel_asm.S | 2 +- arch/x86/crypto/aesni-intel_avx-x86_64.S | 2 +- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- arch/x86/crypto/sha512-avx-asm.S | 2 +- arch/x86/crypto/sha512-ssse3-asm.S | 2 +- arch/x86/events/amd/brs.c | 2 +- arch/x86/events/amd/core.c | 2 +- arch/x86/events/intel/core.c | 2 +- arch/x86/hyperv/hv_apic.c | 2 +- arch/x86/hyperv/irqdomain.c | 2 +- arch/x86/hyperv/ivm.c | 2 +- arch/x86/include/asm/amd_nb.h | 2 +- arch/x86/include/asm/extable_fixup_types.h | 2 +- arch/x86/include/asm/fpu/types.h | 2 +- arch/x86/include/asm/iosf_mbi.h | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/nospec-branch.h | 4 ++-- arch/x86/include/asm/pgtable_64.h | 2 +- arch/x86/include/asm/uv/uv_hub.h | 2 +- arch/x86/include/asm/vdso/gettimeofday.h | 2 +- arch/x86/include/asm/xen/interface_64.h | 2 +- arch/x86/include/uapi/asm/amd_hsmp.h | 2 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/apic/Makefile | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/vector.c | 4 ++-- arch/x86/kernel/cpu/sgx/ioctl.c | 2 +- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/head_64.S | 4 ++-- arch/x86/kernel/hpet.c | 4 ++-- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/ldt.c | 6 +++--- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/sev-shared.c | 2 +- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_iter.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- arch/x86/lib/delay.c | 2 +- arch/x86/mm/init_64.c | 6 +++--- arch/x86/mm/pat/memtype.c | 2 +- arch/x86/mm/pat/set_memory.c | 4 ++-- arch/x86/mm/pti.c | 2 +- arch/x86/mm/tlb.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp32.c | 2 +- arch/x86/platform/intel-quark/imr_selftest.c | 2 +- arch/x86/platform/pvh/head.S | 2 +- arch/x86/platform/uv/uv_nmi.c | 2 +- arch/x86/platform/uv/uv_time.c | 2 +- arch/x86/realmode/init.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- 60 files changed, 72 insertions(+), 72 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 71fc531b95b4..f19c038409aa 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -53,7 +53,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h -# sev.c indirectly inludes inat-table.h which is generated during +# sev.c indirectly includes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c index b3c3a4be7471..dbba332e4a12 100644 --- a/arch/x86/boot/compressed/mem.c +++ b/arch/x86/boot/compressed/mem.c @@ -8,7 +8,7 @@ /* * accept_memory() and process_unaccepted_memory() called from EFI stub which - * runs before decompresser and its early_tdx_detect(). + * runs before decompressor and its early_tdx_detect(). * * Enumerate TDX directly from the early users. */ diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 1b5d17a9f70d..4f06e6721c0a 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -886,7 +886,7 @@ void __init tdx_early_init(void) * there. * * Intel-TDX has a secure RDMSR hypercall, but that needs to be - * implemented seperately in the low level startup ASM code. + * implemented separately in the low level startup ASM code. * Until that is in place, disable parallel bringup for TDX. */ x86_cpuinit.parallel_bringup = false; diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 187f913cc239..411d8c83e88a 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -666,7 +666,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff .ifc \operation, dec movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) mov \PLAIN_CYPH_LEN, %r10 add %r13, %r10 diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 74dd230973cf..8c9749ed0651 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -747,7 +747,7 @@ VARIABLE_OFFSET = 16*8 .if \ENC_DEC == DEC vmovdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) mov \PLAIN_CYPH_LEN, %r10 add %r13, %r10 diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 81ce0f4db555..bbcff1fb78cb 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -184,7 +184,7 @@ SYM_FUNC_START(crc_pcl) xor crc1,crc1 xor crc2,crc2 - # Fall thruogh into top of crc array (crc_128) + # Fall through into top of crc array (crc_128) ################################################################ ## 3) CRC Array: diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index d902b8ea0721..5bfce4b045fd 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -84,7 +84,7 @@ frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even # Input message (arg1) #define MSG(i) 8*i(msg) diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 65be30156816..30a2c4777f9d 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -82,7 +82,7 @@ frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays -# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even # Input message (arg1) #define MSG(i) 8*i(msg) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index ed308719236c..780acd3dff22 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -125,7 +125,7 @@ int amd_brs_hw_config(struct perf_event *event) * Where X is the number of taken branches due to interrupt * skid. Skid is large. * - * Where Y is the occurences of the event while BRS is + * Where Y is the occurrences of the event while BRS is * capturing the lbr_nr entries. * * By using retired taken branches, we limit the impact on the diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e24976593a29..25ad6fd98166 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1184,7 +1184,7 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, * period of each one and given that the BRS saturates, it would not be possible * to guarantee correlated content for all events. Therefore, in situations * where multiple events want to use BRS, the kernel enforces mutual exclusion. - * Exclusion is enforced by chosing only one counter for events using BRS. + * Exclusion is enforced by choosing only one counter for events using BRS. * The event scheduling logic will then automatically multiplex the * events and ensure that at most one event is actively using BRS. * diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a08f794a0e79..fdfcd5112884 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4027,7 +4027,7 @@ static int intel_pmu_hw_config(struct perf_event *event) /* * Currently, the only caller of this function is the atomic_switch_perf_msrs(). - * The host perf conext helps to prepare the values of the real hardware for + * The host perf context helps to prepare the values of the real hardware for * a set of msrs that need to be switched atomically in a vmx transaction. * * For example, the pseudocode needed to add a new msr should look like: diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 97bfe5f0531f..5fc45543e955 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -209,7 +209,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, /* * This particular version of the IPI hypercall can - * only target upto 64 CPUs. + * only target up to 64 CPUs. */ if (vcpu >= 64) goto do_ex_hypercall; diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 42c70d28ef27..3215a4a07408 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -212,7 +212,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) * This interrupt is already mapped. Let's unmap first. * * We don't use retarget interrupt hypercalls here because - * Microsoft Hypervisor doens't allow root to change the vector + * Microsoft Hypervisor doesn't allow root to change the vector * or specify VPs outside of the set that is initially used * during mapping. */ diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 02e55237d919..7dcbf153ad72 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -144,7 +144,7 @@ void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason) /* Tell the hypervisor what went wrong. */ val |= GHCB_SEV_TERM_REASON(set, reason); - /* Request Guest Termination from Hypvervisor */ + /* Request Guest Termination from Hypervisor */ wr_ghcb_msr(val); VMGEXIT(); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index ed0eaf65c437..5c37944c8a5e 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -104,7 +104,7 @@ static inline bool amd_gart_present(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return false; - /* GART present only on Fam15h, upto model 0fh */ + /* GART present only on Fam15h, up to model 0fh */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) return true; diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 991e31cfde94..fe6312045042 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -4,7 +4,7 @@ /* * Our IMM is signed, as such it must live at the top end of the word. Also, - * since C99 hex constants are of ambigious type, force cast the mask to 'int' + * since C99 hex constants are of ambiguous type, force cast the mask to 'int' * so that FIELD_GET() will DTRT and sign extend the value when it extracts it. */ #define EX_DATA_TYPE_MASK ((int)0x000000FF) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb810074f1e7..f1fadc318a88 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -415,7 +415,7 @@ struct fpu_state_perm { * * This master permission field is only to be used when * task.fpu.fpstate based checks fail to validate whether the task - * is allowed to expand it's xfeatures set which requires to + * is allowed to expand its xfeatures set which requires to * allocate a larger sized fpstate buffer. * * Do not access this field directly. Use the provided helper diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index a1911fea8739..af7541c11821 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask); * This function will block all kernel access to the PMIC I2C bus, so that the * P-Unit can safely access the PMIC over the shared I2C bus. * - * Note on these systems the i2c-bus driver will request a sempahore from the + * Note on these systems the i2c-bus driver will request a semaphore from the * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing * it, but this does not appear to be sufficient, we still need to avoid making * certain P-Unit requests during the access window to avoid problems. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d7036982332e..6711da000bb7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1652,7 +1652,7 @@ struct kvm_x86_ops { /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* - * Attempt to pend a virtual NMI in harware. Returns %true on success + * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback. */ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f93e9b96927a..262e65539f83 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -49,7 +49,7 @@ * but there is still a cushion vs. the RSB depth. The algorithm does not * claim to be perfect and it can be speculated around by the CPU, but it * is considered that it obfuscates the problem enough to make exploitation - * extremly difficult. + * extremely difficult. */ #define RET_DEPTH_SHIFT 5 #define RSB_RET_STUFF_LOOPS 16 @@ -208,7 +208,7 @@ /* * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should - * eventually turn into it's own annotation. + * eventually turn into its own annotation. */ .macro VALIDATE_UNRET_END #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index a629b1b9f65a..24af25b1551a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -203,7 +203,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * - * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5fa76c2ced51..ea877fd83114 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -653,7 +653,7 @@ static inline int uv_blade_to_node(int blade) return uv_socket_to_node(blade); } -/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ +/* Blade number of current cpu. Numbered 0 .. <#blades -1> */ static inline int uv_numa_blade_id(void) { return uv_hub_info->numa_blade_id; diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index c81858d903dc..923053f366d7 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -321,7 +321,7 @@ static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) { /* - * Due to the MSB/Sign-bit being used as invald marker (see + * Due to the MSB/Sign-bit being used as invalid marker (see * arch_vdso_cycles_valid() above), the effective mask is S64_MAX. */ u64 delta = (cycles - last) & S64_MAX; diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index c599ec269a25..c10f279aae93 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h @@ -61,7 +61,7 @@ * RING1 -> RING3 kernel mode. * RING2 -> RING3 kernel mode. * RING3 -> RING3 user mode. - * However RING0 indicates that the guest kernel should return to iteself + * However RING0 indicates that the guest kernel should return to itself * directly with * orb $3,1*8(%rsp) * iretq diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index fce22686c834..e5d182c7373c 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -238,7 +238,7 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { /* * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 * input: args[0] = DIMM address[7:0] - * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] + * DIMM address[7:0] */ {1, 1, HSMP_GET}, diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 73be3931e4f0..0c7fc2b92302 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1896,7 +1896,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching - * trough a mutex. + * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 56a917df410d..2ae98f754e59 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -776,7 +776,7 @@ int __init gart_iommu_init(void) iommu_size >> PAGE_SHIFT); /* * Tricky. The GART table remaps the physical memory range, - * so the CPU wont notice potential aliases and if the memory + * so the CPU won't notice potential aliases and if the memory * is remapped to UC later on, we might surprise the PCI devices * with a stray writeout of a cacheline. So play it sure and * do an explicit, full-scale wbinvd() _after_ having marked all diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 2ee867d796d9..3bf0487cf3b7 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -4,7 +4,7 @@ # # Leads to non-deterministic coverage that is not a function of syscall inputs. -# In particualr, smp_apic_timer_interrupt() is called in random places. +# In particular, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 41093cf20acd..4667bc4b00ab 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -782,7 +782,7 @@ bool __init apic_needs_pit(void) /* * If interrupt delivery mode is legacy PIC or virtual wire without - * configuration, the local APIC timer wont be set up. Make sure + * configuration, the local APIC timer won't be set up. Make sure * that the PIT is initialized. */ if (apic_intr_mode == APIC_PIC || diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 319448d87b99..185738c72766 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void) void lapic_assign_legacy_vector(unsigned int irq, bool replace) { /* - * Use assign system here so it wont get accounted as allocated - * and moveable in the cpu hotplug check and it prevents managed + * Use assign system here so it won't get accounted as allocated + * and movable in the cpu hotplug check and it prevents managed * irq reservation from touching it. */ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 5d390df21440..b65ab214bdf5 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -581,7 +581,7 @@ err_out: * * Flush any outstanding enqueued EADD operations and perform EINIT. The * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match - * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * the enclave's MRSIGNER, which is calculated from the provided sigstruct. * * Return: * - 0: Success. diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a21a4d0ecc34..520deb411a70 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the - * same as the state on VMENTER. So software state has to be udpated before + * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1f79d809305d..d8d5d2224f43 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -205,9 +205,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * Switch to new page-table * * For the boot CPU this switches to early_top_pgt which still has the - * indentity mappings present. The secondary CPUs will switch to the + * identity mappings present. The secondary CPUs will switch to the * init_top_pgt here, away from the trampoline_pgd and unmap the - * indentity mapped ranges. + * identity mapped ranges. */ movq %rax, %cr3 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 41eecf180b7f..8ff2bf921519 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -707,7 +707,7 @@ static void __init hpet_select_clockevents(void) hpet_base.nr_clockevents = 0; - /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */ + /* No point if MSI is disabled or CPU has an Always Running APIC Timer */ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) return; @@ -965,7 +965,7 @@ static bool __init mwait_pc10_supported(void) * and per CPU timer interrupts. * * The probability that this problem is going to be solved in the - * forseeable future is close to zero, so the kernel has to be cluttered + * foreseeable future is close to zero, so the kernel has to be cluttered * with heuristics to keep up with the ever growing amount of hardware and * firmware trainwrecks. Hopefully some day hardware people will understand * that the approach of "This can be fixed in software" is not sustainable. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0ddb3bd0f1aa..8b57e020614c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -942,7 +942,7 @@ static void __init kvm_init_platform(void) * Reset the host's shared pages list related to kernel * specific page encryption status settings before we load a * new kernel by kexec. Reset the page encryption status - * during early boot intead of just before kexec to avoid SMP + * during early boot instead of just before kexec to avoid SMP * races during kvm_pv_guest_cpu_reboot(). * NOTE: We cannot reset the complete shared pages list * here as we need to retain the UEFI/OVMF firmware diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index fb8f52149be9..a95d0900e8c6 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg) } early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */ #define HVC_BOOT_ARRAY_SIZE \ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index adc67f98819a..7a814b41402d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,7 +7,7 @@ * This handles calls from both 32bit and 64bit mode. * * Lock order: - * contex.ldt_usr_sem + * context.ldt_usr_sem * mmap_lock * context.lock */ @@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm) /* * Any change to mm->context.ldt is followed by an IPI to all * CPUs with the mm active. The LDT will not be freed until - * after the IPI is handled by all such CPUs. This means that, + * after the IPI is handled by all such CPUs. This means that * if the ldt_struct changes before we return, the values we see * will be safe, and the new values will be loaded before we run * any user code. @@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , } /* * The SYSCALL_DEFINE() macros give us an 'unsigned long' - * return type, but tht ABI for sys_modify_ldt() expects + * return type, but the ABI for sys_modify_ldt() expects * 'int'. This cast gives us an int-sized value in %rax * for the return code. The 'unsigned' is necessary so * the compiler does not try to sign-extend the negative diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b6f4e8399fca..ab49ade31b0d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -477,7 +477,7 @@ void native_tss_update_io_bitmap(void) /* * Make sure that the TSS limit is covering the IO bitmap. It might have * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O - * access from user space to trigger a #GP because tbe bitmap is outside + * access from user space to trigger a #GP because the bitmap is outside * the TSS limit. */ refresh_tss_limit(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index ccb0915e84e1..1d24ec679915 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -96,7 +96,7 @@ static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) /* Tell the hypervisor what went wrong. */ val |= GHCB_SEV_TERM_REASON(set, reason); - /* Request Guest Termination from Hypvervisor */ + /* Request Guest Termination from Hypervisor */ sev_es_wr_ghcb_msr(val); VMGEXIT(); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index dda6fc4cfae8..42d3f47f4c07 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -105,7 +105,7 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( /* * If the index isn't significant, use the first entry with a - * matching function. It's userspace's responsibilty to not + * matching function. It's userspace's responsibility to not * provide "duplicate" entries in all cases. */ if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c57e181bba21..0b1f991b9a31 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -987,7 +987,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, /* * The head descriptor is empty. If there are no tail descriptors, - * nullify the rmap head to mark the list as emtpy, else point the rmap + * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) @@ -6544,7 +6544,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* - * A TLB flush is unnecessary at this point for the same resons as in + * A TLB flush is unnecessary at this point for the same reasons as in * kvm_mmu_slot_try_split_huge_pages(). */ } diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index bd30ebfb2f2c..04c247bfe318 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -146,7 +146,7 @@ static bool try_step_up(struct tdp_iter *iter) * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a - * highter GFN. + * higher GFN. * * The basic algorithm is as follows: * 1. If the current SPTE is a non-last-level SPTE, step down into the page diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 712146312358..7097954904cb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4741,7 +4741,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * Emulation is possible for SEV guests if and only if a prefilled * buffer containing the bytes of the intercepted instruction is * available. SEV guest memory is encrypted with a guest specific key - * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and * decode garbage. * * If KVM is NOT trying to simply skip an instruction, inject #UD if diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c5ec0ef51ff7..65826fe23f33 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6561,7 +6561,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * code was changed such that flag signals vmcs12 should * be copied into eVMCS in guest memory. * - * To preserve backwards compatability, allow user + * To preserve backwards compatibility, allow user * to set this flag even when there is no VMXON region. */ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be20a60047b1..e0f86f11c345 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1809,7 +1809,7 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu) * do generate error codes with bits 31:16 set, and so KVM's * ABI lets userspace shove in arbitrary 32-bit values. Drop * the upper bits to avoid VM-Fail, losing information that - * does't really exist is preferable to killing the VM. + * doesn't really exist is preferable to killing the VM. */ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c924075f6f1..b43b37c414d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10165,7 +10165,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) * * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip * the instruction or inject an exception, then KVM can incorrecty inject a new - * asynchrounous event if the event became pending after the CPU fetched the + * asynchronous event if the event became pending after the CPU fetched the * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation) * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be * injected on the restarted instruction instead of being deferred until the @@ -10186,7 +10186,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, int r; /* - * Process nested events first, as nested VM-Exit supercedes event + * Process nested events first, as nested VM-Exit supersedes event * re-injection. If there's an event queued for re-injection, it will * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. */ @@ -10884,7 +10884,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Assert that vCPU vs. VM APICv state is consistent. An APICv * update must kick and wait for all vCPUs before toggling the - * per-VM state, and responsing vCPUs must wait for the update + * per-VM state, and responding vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 0e65d00e2339..23f81ca3f06b 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -128,7 +128,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles) delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); /* - * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu + * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu * variable as the monitor target. */ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a190aae8ceaf..a0dffaca6d2b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1013,7 +1013,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) return; } - /* free a pte talbe */ + /* free a pte table */ free_pagetable(pmd_page(*pmd), 0); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); @@ -1031,7 +1031,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) return; } - /* free a pmd talbe */ + /* free a pmd table */ free_pagetable(pud_page(*pud), 0); spin_lock(&init_mm.page_table_lock); pud_clear(pud); @@ -1049,7 +1049,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) return; } - /* free a pud talbe */ + /* free a pud table */ free_pagetable(p4d_page(*p4d), 0); spin_lock(&init_mm.page_table_lock); p4d_clear(p4d); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index de10800cd4dd..0904d7e8e126 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -14,7 +14,7 @@ * memory ranges: uncached, write-combining, write-through, write-protected, * and the most commonly used and default attribute: write-back caching. * - * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is + * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is * a hardware interface to enumerate a limited number of physical memory ranges * and set their caching attributes explicitly, programmed into the CPU via MSRs. * Even modern CPUs have MTRRs enabled - but these are typically not touched diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index bda9f129835e..e9b448d1b1b7 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1621,7 +1621,7 @@ repeat: /* * We need to keep the pfn from the existing PTE, - * after all we're only going to change it's attributes + * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); @@ -2447,7 +2447,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because - * the address is already mapped) and change it's protections. As pfn is + * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 5dd733944629..669ba1c345b3 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -6,7 +6,7 @@ * * https://github.com/IAIK/KAISER * - * The original work was written by and and signed off by for the Linux + * The original work was written by and signed off by for the Linux * kernel by: * * Signed-off-by: Richard Fellner diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 453ea95b667d..5768d386efab 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -355,7 +355,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, /* * Validate that it is not running on an SMT sibling as this would - * make the excercise pointless because the siblings share L1D. If + * make the exercise pointless because the siblings share L1D. If * it runs on a SMT sibling, notify it with SIGBUS on return to * user/guest */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8c10d9abc239..75ae6e5777e4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2143,7 +2143,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog, } else { /* Only copy the arguments on-stack to current * 'stack_size' and ignore the regs, used to - * prepare the arguments on-stack for orign call. + * prepare the arguments on-stack for origin call. */ if (for_call_origin) { nr_regs += arg_regs; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 429a89c5468b..b18ce19981ec 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1194,7 +1194,7 @@ struct jit_context { #define PROLOGUE_SIZE 35 /* - * Emit prologue code for BPF program and check it's size. + * Emit prologue code for BPF program and check its size. * bpf_tail_call helper will skip it while jumping into another program. */ static void emit_prologue(u8 **pprog, u32 stack_depth) diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 761f3689f60a..84ba715f44d1 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -6,7 +6,7 @@ * Copyright(c) 2015 Bryan O'Donoghue * * IMR self test. The purpose of this module is to run a set of tests on the - * IMR API to validate it's sanity. We check for overlapping, reserved + * IMR API to validate its sanity. We check for overlapping, reserved * addresses and setup/teardown sanity. * */ diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index c4365a05ab83..bc27be467b96 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -41,7 +41,7 @@ * Bit 8 (TF) must be cleared. Other bits are all unspecified. * * All other processor registers and flag bits are unspecified. The OS is in - * charge of setting up it's own stack, GDT and IDT. + * charge of setting up its own stack, GDT and IDT. */ #define PVH_GDT_ENTRY_CS 1 diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index e03207de2880..5c50e550ab63 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -741,7 +741,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); } -/* Trigger a slave CPU to dump it's state */ +/* Trigger a slave CPU to dump its state */ static void uv_nmi_trigger_dump(int cpu) { int retry = uv_nmi_trigger_delay; diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index ff5afc8a5a41..3712afc3534d 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -270,7 +270,7 @@ static int uv_rtc_unset_timer(int cpu, int force) * Read the RTC. * * Starting with HUB rev 2.0, the UV RTC register is replicated across all - * cachelines of it's own page. This allows faster simultaneous reads + * cachelines of its own page. This allows faster simultaneous reads * from a given socket. */ static u64 uv_read_rtc(struct clocksource *cs) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 788e5559549f..f9bc444a3064 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -61,7 +61,7 @@ void __init reserve_real_mode(void) set_real_mode_mem(mem); /* - * Unconditionally reserve the entire fisrt 1M, see comment in + * Unconditionally reserve the entire first 1M, see comment in * setup_arch(). */ memblock_reserve(0, SZ_1M); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index b6830554ff69..72af496a160c 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -34,7 +34,7 @@ * would need to validate the whole pagetable before going on. * Naturally, this is quite slow. The solution is to "pin" a * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured + * when it is not actively in use. This means that Xen can be assured * that it is still valid when you do load it into %cr3, and doesn't * need to revalidate it. * From a15f2d48c6f84ae0dd2000288592c79d5d1acd0e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:47:41 +0100 Subject: [PATCH 306/310] nubus: Make nubus_bus_type static and constant Now that the driver core can properly handle constant struct bus_type, move the nubus_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. It's also never used outside of drivers/nubus/bus.c so make it static and don't export it as no one is using it. Signed-off-by: Greg Kroah-Hartman Acked-by: Finn Thain Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/2023121940-enlarged-editor-c9a8@gregkh Signed-off-by: Geert Uytterhoeven --- drivers/nubus/bus.c | 3 +-- include/linux/nubus.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/nubus/bus.c b/drivers/nubus/bus.c index 72921e4f35f6..12df4d88970c 100644 --- a/drivers/nubus/bus.c +++ b/drivers/nubus/bus.c @@ -32,12 +32,11 @@ static void nubus_device_remove(struct device *dev) ndrv->remove(to_nubus_board(dev)); } -struct bus_type nubus_bus_type = { +static const struct bus_type nubus_bus_type = { .name = "nubus", .probe = nubus_device_probe, .remove = nubus_device_remove, }; -EXPORT_SYMBOL(nubus_bus_type); int nubus_driver_register(struct nubus_driver *ndrv) { diff --git a/include/linux/nubus.h b/include/linux/nubus.h index bdcd85e622d8..4d103ac8f5c7 100644 --- a/include/linux/nubus.h +++ b/include/linux/nubus.h @@ -89,8 +89,6 @@ struct nubus_driver { void (*remove)(struct nubus_board *board); }; -extern struct bus_type nubus_bus_type; - /* Generic NuBus interface functions, modelled after the PCI interface */ #ifdef CONFIG_PROC_FS extern bool nubus_populate_procfs; From 6b9c045b0602cf64b33ea6da5e6aa6f81dd47ae8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 14 Nov 2023 14:33:58 +0100 Subject: [PATCH 307/310] m68k: defconfig: Update defconfigs for v6.7-rc1 - Enable modular build of the new bcachefs filesystem, - Drop CONFIG_CRYPTO_MANAGER=y (auto-enabled since commit 845346841b77af84 ("crypto: skcipher - Add dependency on ecb")), - Drop CONFIG_DEV_APPLETALK=m, CONFIG_IPDDP=m, and CONFIG_IPDDP_ENCAP=y (removed in commit 1dab47139e6118a4 ("appletalk: remove ipddp driver")). Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/7abb82edd14ee77d985f3949a673c52bb2ee28b5.1699960088.git.geert@linux-m68k.org --- arch/m68k/configs/amiga_defconfig | 2 +- arch/m68k/configs/apollo_defconfig | 2 +- arch/m68k/configs/atari_defconfig | 2 +- arch/m68k/configs/bvme6000_defconfig | 2 +- arch/m68k/configs/hp300_defconfig | 2 +- arch/m68k/configs/mac_defconfig | 5 +---- arch/m68k/configs/multi_defconfig | 5 +---- arch/m68k/configs/mvme147_defconfig | 2 +- arch/m68k/configs/mvme16x_defconfig | 2 +- arch/m68k/configs/q40_defconfig | 2 +- arch/m68k/configs/sun3_defconfig | 2 +- arch/m68k/configs/sun3x_defconfig | 2 +- 12 files changed, 12 insertions(+), 18 deletions(-) diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 7e6b74b6eecd..b4d71fea558f 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -453,6 +453,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -550,7 +551,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 0b403e2efcd5..682d8cd3dd3c 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -410,6 +410,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -507,7 +508,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 57aac3f4b001..15259ced8463 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -430,6 +430,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -527,7 +528,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 3c160636a2e9..7395c12caef6 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -402,6 +402,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -499,7 +500,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 23cf07c49d14..92506bc7f78d 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -412,6 +412,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -509,7 +510,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 619a0d93ce5b..144bc8c0d8b5 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -269,9 +269,6 @@ CONFIG_RDS_TCP=m CONFIG_L2TP=m CONFIG_BRIDGE=m CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y CONFIG_6LOWPAN=m CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m CONFIG_6LOWPAN_GHC_UDP=m @@ -432,6 +429,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -529,7 +527,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index d9430bc2b2de..07594c729497 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -289,9 +289,6 @@ CONFIG_RDS_TCP=m CONFIG_L2TP=m CONFIG_BRIDGE=m CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y CONFIG_6LOWPAN=m CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m CONFIG_6LOWPAN_GHC_UDP=m @@ -518,6 +515,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -615,7 +613,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index eb6132f29bf5..c34de6c1de20 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -401,6 +401,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -498,7 +499,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index d0bad674cbb7..83bc029d1f33 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -402,6 +402,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -499,7 +500,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index dad6bcfcaeed..4f551dac2ed7 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -419,6 +419,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -516,7 +517,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index eb1b489b3139..b1bf01182bd3 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -400,6 +400,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -497,7 +498,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 939589826546..5c9a3f71f036 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -400,6 +400,7 @@ CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set +CONFIG_BCACHEFS_FS=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y CONFIG_AUTOFS_FS=m @@ -497,7 +498,6 @@ CONFIG_NLS_MAC_TURKISH=m CONFIG_DLM=m CONFIG_ENCRYPTED_KEYS=m CONFIG_HARDENED_USERCOPY=y -CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_USER=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_TEST=m From bcf7ef56daca2eacf836d22eee23c66f7cd96a65 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 5 Dec 2023 12:53:08 -0700 Subject: [PATCH 308/310] x86/tools: objdump_reformat.awk: Skip bad instructions from llvm-objdump When running the instruction decoder selftest with LLVM=1 and CONFIG_PVH=y, there is a series of warnings: arch/x86/tools/insn_decoder_test: warning: Found an x86 instruction decoder bug, please report this. arch/x86/tools/insn_decoder_test: warning: ffffffff81000050 ea arch/x86/tools/insn_decoder_test: warning: objdump says 1 bytes, but insn_get_length() says 7 arch/x86/tools/insn_decoder_test: warning: Decoded and checked 7214721 instructions with 1 failures GNU objdump outputs "(bad)" instead of "", which is already handled in the bad_expr regex, so there is no warning. $ objdump -d arch/x86/platform/pvh/head.o | grep -E '50:\s+ea' 50: ea (bad) $ llvm-objdump -d arch/x86/platform/pvh/head.o | grep -E '50:\s+ea' 50: ea Add "" to the bad_expr regex to clear up the warning, allowing the instruction decoder selftest to fully pass with llvm-objdump. Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231205-objdump_reformat-awk-handle-llvm-objdump-bad_expr-v1-1-b4a74f39396f@kernel.org --- arch/x86/tools/objdump_reformat.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk index a4120d907277..20b08a6c4d33 100644 --- a/arch/x86/tools/objdump_reformat.awk +++ b/arch/x86/tools/objdump_reformat.awk @@ -11,7 +11,7 @@ BEGIN { prev_addr = "" prev_hex = "" prev_mnemonic = "" - bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + bad_expr = "(\\(bad\\)||^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" fwait_expr = "^9b[ \t]*fwait" fwait_str="9b\tfwait" } From 2b9d9e0a9ba0e24cb9c78336481f0ed8b2bc1ff2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Jan 2024 09:31:16 +0100 Subject: [PATCH 309/310] locking/mutex: Clarify that mutex_unlock(), and most other sleeping locks, can still use the lock object after it's unlocked Clarify the mutex lock lifetime rules a bit more. Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Jann Horn Cc: Linus Torvalds Link: https://lore.kernel.org/r/20231201121808.GL3818@noisy.programming.kicks-ass.net --- Documentation/locking/mutex-design.rst | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst index 7572339b2f12..7c30b4aa5e28 100644 --- a/Documentation/locking/mutex-design.rst +++ b/Documentation/locking/mutex-design.rst @@ -101,12 +101,24 @@ features that make lock debugging easier and faster: - Detects multi-task circular deadlocks and prints out all affected locks and tasks (and only those tasks). -Releasing a mutex is not an atomic operation: Once a mutex release operation -has begun, another context may be able to acquire the mutex before the release -operation has fully completed. The mutex user must ensure that the mutex is not -destroyed while a release operation is still in progress - in other words, -callers of mutex_unlock() must ensure that the mutex stays alive until -mutex_unlock() has returned. +Mutexes - and most other sleeping locks like rwsems - do not provide an +implicit reference for the memory they occupy, which reference is released +with mutex_unlock(). + +[ This is in contrast with spin_unlock() [or completion_done()], which + APIs can be used to guarantee that the memory is not touched by the + lock implementation after spin_unlock()/completion_done() releases + the lock. ] + +mutex_unlock() may access the mutex structure even after it has internally +released the lock already - so it's not safe for another context to +acquire the mutex and assume that the mutex_unlock() context is not using +the structure anymore. + +The mutex user must ensure that the mutex is not destroyed while a +release operation is still in progress - in other words, callers of +mutex_unlock() must ensure that the mutex stays alive until mutex_unlock() +has returned. Interfaces ---------- From f0a78b3e2a0c842cc7b4c2686f4a35681f02ca72 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 8 Jan 2024 17:09:04 -0800 Subject: [PATCH 310/310] arm64: Update __NR_compat_syscalls for statmount/listmount Commit d8b0f5465012 ("wire up syscalls for statmount/listmount") added two new system calls to arch/arm64/include/asm/unistd32.h but forgot to update the __NR_compat_syscalls number, thus causing the following build failures: arch/arm64/include/asm/unistd32.h:922:24: error: array index in initializer exceeds array bounds 922 | #define __NR_statmount 457 | ^~~ arch/arm64/kernel/sys32.c:130:34: note: in definition of macro '__SYSCALL' 130 | #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, | ^~ Bump up the number by two to accomodate for the new system calls added. Fixes: d8b0f5465012 ("wire up syscalls for statmount/listmount") Signed-off-by: Florian Fainelli Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/unistd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 531effca5f1f..b63f870debaf 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 457 +#define __NR_compat_syscalls 459 #endif #define __ARCH_WANT_SYS_CLONE