From 3b299b99556c1753923f8d9bbd9304bcd139282f Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 2 Jul 2024 13:21:37 +0000 Subject: [PATCH 001/126] x86/mm: Use IPIs to synchronize LAM enablement LAM can only be enabled when a process is single-threaded. But _kernel_ threads can temporarily use a single-threaded process's mm. If LAM is enabled by a userspace process while a kthread is using its mm, the kthread will not observe LAM enablement (i.e. LAM will be disabled in CR3). This could be fine for the kthread itself, as LAM only affects userspace addresses. However, if the kthread context switches to a thread in the same userspace process, CR3 may or may not be updated because the mm_struct doesn't change (based on pending TLB flushes). If CR3 is not updated, the userspace thread will run incorrectly with LAM disabled, which may cause page faults when using tagged addresses. Example scenario: CPU 1 CPU 2 /* kthread */ kthread_use_mm() /* user thread */ prctl_enable_tagged_addr() /* LAM enabled on CPU 2 */ /* LAM disabled on CPU 1 */ context_switch() /* to CPU 1 */ /* Switching to user thread */ switch_mm_irqs_off() /* CR3 not updated */ /* LAM is still disabled on CPU 1 */ Synchronize LAM enablement by sending an IPI to all CPUs running with the mm_struct to enable LAM. This makes sure LAM is enabled on CPU 1 in the above scenario before prctl_enable_tagged_addr() returns and userspace starts using tagged addresses, and before it's possible to run the userspace process on CPU 1. In switch_mm_irqs_off(), move reading the LAM mask until after mm_cpumask() is updated. This ensures that if an outdated LAM mask is written to CR3, an IPI is received to update it right after IRQs are re-enabled. [ dhansen: Add a LAM enabling helper and comment it ] Fixes: 82721d8b25d7 ("x86/mm: Handle LAM on context switch") Suggested-by: Andy Lutomirski Signed-off-by: Yosry Ahmed Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20240702132139.3332013-2-yosryahmed%40google.com --- arch/x86/kernel/process_64.c | 29 ++++++++++++++++++++++++++--- arch/x86/mm/tlb.c | 7 +++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6d3d20e3e43a..d8d582b750d4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -798,6 +798,27 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #define LAM_U57_BITS 6 +static void enable_lam_func(void *__mm) +{ + struct mm_struct *mm = __mm; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { + write_cr3(__read_cr3() | mm->context.lam_cr3_mask); + set_tlbstate_lam_mode(mm); + } +} + +static void mm_enable_lam(struct mm_struct *mm) +{ + /* + * Even though the process must still be single-threaded at this + * point, kernel threads may be using the mm. IPI those kernel + * threads if they exist. + */ + on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); +} + static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) @@ -814,6 +835,10 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) if (mmap_write_lock_killable(mm)) return -EINTR; + /* + * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from + * being enabled unless the process is single threaded: + */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; @@ -830,9 +855,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) return -EINVAL; } - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); - set_tlbstate_lam_mode(mm); - set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + mm_enable_lam(mm); mmap_write_unlock(mm); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 44ac64f3a047..a041d2ecd838 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -503,9 +503,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, { struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - unsigned long new_lam = mm_lam_cr3_mask(next); bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); + unsigned long new_lam; u64 next_tlb_gen; bool need_flush; u16 new_asid; @@ -619,9 +619,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cpumask_clear_cpu(cpu, mm_cpumask(prev)); } - /* - * Start remote flushes and then read tlb_gen. - */ + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ if (next != &init_mm) cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); @@ -633,6 +631,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, barrier(); } + new_lam = mm_lam_cr3_mask(next); set_tlbstate_lam_mode(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); From ec225f8c255fd0f256c282cc73d211550cb08b34 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 2 Jul 2024 13:21:38 +0000 Subject: [PATCH 002/126] x86/mm: Fix LAM inconsistency during context switch LAM can only be enabled when a process is single-threaded. But _kernel_ threads can temporarily use a single-threaded process's mm. That means that a context-switching kernel thread can race and observe the mm's LAM metadata (mm->context.lam_cr3_mask) change. The context switch code does two logical things with that metadata: populate CR3 and populate 'cpu_tlbstate.lam'. If it hits this race, 'cpu_tlbstate.lam' and CR3 can end up out of sync. This de-synchronization is currently harmless. But it is confusing and might lead to warnings or real bugs. Update set_tlbstate_lam_mode() to take in the LAM mask and untag mask instead of an mm_struct pointer, and while we are at it, rename it to cpu_tlbstate_update_lam(). This should also make it clearer that we are updating cpu_tlbstate. In switch_mm_irqs_off(), read the LAM mask once and use it for both the cpu_tlbstate update and the CR3 update. Signed-off-by: Yosry Ahmed Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20240702132139.3332013-3-yosryahmed%40google.com --- arch/x86/include/asm/mmu_context.h | 8 +++++++- arch/x86/include/asm/tlbflush.h | 9 ++++----- arch/x86/kernel/process_64.c | 6 ++++-- arch/x86/mm/tlb.c | 8 +++++--- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8dac45a2c7fc..19091ebb8633 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { - return mm->context.lam_cr3_mask; + /* + * When switch_mm_irqs_off() is called for a kthread, it may race with + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it + * reads a single value for both. + */ + return READ_ONCE(mm->context.lam_cr3_mask); } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 25726893c6f4..69e79fff41b8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void) return lam << X86_CR3_LAM_U57_BIT; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { - this_cpu_write(cpu_tlbstate.lam, - mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT); - this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask); + this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, untag_mask); } #else @@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void) return 0; } -static inline void set_tlbstate_lam_mode(struct mm_struct *mm) +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d8d582b750d4..e9f7cfdb9420 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -801,10 +801,12 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) static void enable_lam_func(void *__mm) { struct mm_struct *mm = __mm; + unsigned long lam; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); - set_tlbstate_lam_mode(mm); + lam = mm_lam_cr3_mask(mm); + write_cr3(__read_cr3() | lam); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); } } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a041d2ecd838..1fe9ba33c580 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -632,7 +633,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, } new_lam = mm_lam_cr3_mask(next); - set_tlbstate_lam_mode(next); if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); @@ -651,6 +651,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + cpu_tlbstate_update_lam(new_lam, mm_untag_mask(next)); if (next != prev) { cr4_update_pce_mm(next); @@ -697,6 +698,7 @@ void initialize_tlbstate_and_flush(void) int i; struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long lam = mm_lam_cr3_mask(mm); unsigned long cr3 = __read_cr3(); /* Assert that CR3 already references the right mm. */ @@ -704,7 +706,7 @@ void initialize_tlbstate_and_flush(void) /* LAM expected to be disabled */ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); - WARN_ON(mm_lam_cr3_mask(mm)); + WARN_ON(lam); /* * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization @@ -723,7 +725,7 @@ void initialize_tlbstate_and_flush(void) this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); - set_tlbstate_lam_mode(mm); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); for (i = 1; i < TLB_NR_DYN_ASIDS; i++) this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); From b7c35279e0da414e7d90eba76f58a16223a734cb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 2 Jul 2024 13:21:39 +0000 Subject: [PATCH 003/126] x86/mm: Cleanup prctl_enable_tagged_addr() nr_bits error checking There are two separate checks in prctl_enable_tagged_addr() that nr_bits is in the correct range. The checks are arranged such the correct case is sandwiched between both error cases, which do exactly the same thing. Simplify the if condition and pull the correct case outside with the rest of the success code path. Signed-off-by: Yosry Ahmed Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/20240702132139.3332013-4-yosryahmed%40google.com --- arch/x86/kernel/process_64.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e9f7cfdb9420..226472332a70 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -812,6 +812,9 @@ static void enable_lam_func(void *__mm) static void mm_enable_lam(struct mm_struct *mm) { + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + /* * Even though the process must still be single-threaded at this * point, kernel threads may be using the mm. IPI those kernel @@ -846,13 +849,7 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) return -EBUSY; } - if (!nr_bits) { - mmap_write_unlock(mm); - return -EINVAL; - } else if (nr_bits <= LAM_U57_BITS) { - mm->context.lam_cr3_mask = X86_CR3_LAM_U57; - mm->context.untag_mask = ~GENMASK(62, 57); - } else { + if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } From 59c34008d3bdeef4c8ebc0ed2426109b474334d4 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Mon, 22 Jul 2024 14:58:01 +0530 Subject: [PATCH 004/126] x86/amd_nb: Add new PCI IDs for AMD family 1Ah model 60h Add new PCI device IDs into the root IDs and miscellaneous IDs lists to provide support for the latest generation of AMD 1Ah family 60h processor models. Signed-off-by: Shyam Sundar S K Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20240722092801.3480266-1-Shyam-sundar.S-k@amd.com --- arch/x86/kernel/amd_nb.c | 3 +++ drivers/hwmon/k10temp.c | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 5 insertions(+) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 059e5c16af05..61eadde08511 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -26,6 +26,7 @@ #define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 #define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a #define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 +#define PCI_DEVICE_ID_AMD_1AH_M60H_ROOT 0x1122 #define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb #define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 @@ -63,6 +64,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, {} @@ -95,6 +97,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 543526bac042..f96b91e43312 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -548,6 +548,7 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3) }, { PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index e388c8b1cbc2..91182aa1d2ec 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -580,6 +580,7 @@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb +#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3 0x124b #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3 0x12bb #define PCI_DEVICE_ID_AMD_MI200_DF_F3 0x14d3 #define PCI_DEVICE_ID_AMD_MI300_DF_F3 0x152b From ba386777a30b38dabcc7fb8a89ec2869a09915f7 Mon Sep 17 00:00:00 2001 From: Vignesh Balasubramanian Date: Thu, 25 Jul 2024 21:40:18 +0530 Subject: [PATCH 005/126] x86/elf: Add a new FPU buffer layout info to x86 core files Add a new .note section containing type, size, offset and flags of every xfeature that is present. This information will be used by debuggers to understand the XSAVE layout of the machine where the core file has been dumped, and to read XSAVE registers, especially during cross-platform debugging. The XSAVE layouts of modern AMD and Intel CPUs differ, especially since Memory Protection Keys and the AVX-512 features have been inculcated into the AMD CPUs. Since AMD never adopted (and hence never left room in the XSAVE layout for) the Intel MPX feature, tools like GDB had assumed a fixed XSAVE layout matching that of Intel (based on the XCR0 mask). Hence, core dumps from AMD CPUs didn't match the known size for the XCR0 mask. This resulted in GDB and other tools not being able to access the values of the AVX-512 and PKRU registers on AMD CPUs. To solve this, an interim solution has been accepted into GDB, and is already a part of GDB 14, see https://sourceware.org/pipermail/gdb-patches/2023-March/198081.html. But it depends on heuristics based on the total XSAVE register set size and the XCR0 mask to infer the layouts of the various register blocks for core dumps, and hence, is not a foolproof mechanism to determine the layout of the XSAVE area. Therefore, add a new core dump note in order to allow GDB/LLDB and other relevant tools to determine the layout of the XSAVE area of the machine where the corefile was dumped. The new core dump note (which is being proposed as a per-process .note section), NT_X86_XSAVE_LAYOUT (0x205) contains an array of structures. Each structure describes an individual extended feature containing offset, size and flags in this format: struct x86_xfeat_component { u32 type; u32 size; u32 offset; u32 flags; }; and in an independent manner, allowing for future extensions without depending on hw arch specifics like CPUID etc. [ bp: Massage commit message, zap trailing whitespace. ] Co-developed-by: Jini Susan George Signed-off-by: Jini Susan George Co-developed-by: Borislav Petkov (AMD) Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Vignesh Balasubramanian Link: https://lore.kernel.org/r/20240725161017.112111-2-vigbalas@amd.com --- arch/x86/Kconfig | 1 + arch/x86/include/uapi/asm/elf.h | 16 ++++++ arch/x86/kernel/fpu/xstate.c | 89 +++++++++++++++++++++++++++++++++ fs/binfmt_elf.c | 4 +- include/uapi/linux/elf.h | 1 + 5 files changed, 109 insertions(+), 2 deletions(-) create mode 100644 arch/x86/include/uapi/asm/elf.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 007bab9f2a0e..c15b4b3fb328 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -107,6 +107,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_HAVE_EXTRA_ELF_NOTES select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/include/uapi/asm/elf.h b/arch/x86/include/uapi/asm/elf.h new file mode 100644 index 000000000000..468e135fa285 --- /dev/null +++ b/arch/x86/include/uapi/asm/elf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_X86_ELF_H +#define _UAPI_ASM_X86_ELF_H + +#include + +struct x86_xfeat_component { + __u32 type; + __u32 size; + __u32 offset; + __u32 flags; +} __packed; + +_Static_assert(sizeof(struct x86_xfeat_component) % 4 == 0, "x86_xfeat_component is not aligned"); + +#endif /* _UAPI_ASM_X86_ELF_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e0..f3a2e59a28e7 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,8 @@ #include #include +#include + #include "context.h" #include "internal.h" #include "legacy.h" @@ -1838,3 +1841,89 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_COREDUMP +static const char owner_name[] = "LINUX"; + +/* + * Dump type, size, offset and flag values for every xfeature that is present. + */ +static int dump_xsave_layout_desc(struct coredump_params *cprm) +{ + int num_records = 0; + int i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) { + struct x86_xfeat_component xc = { + .type = i, + .size = xstate_sizes[i], + .offset = xstate_offsets[i], + /* reserved for future use */ + .flags = 0, + }; + + if (!dump_emit(cprm, &xc, sizeof(xc))) + return 0; + + num_records++; + } + return num_records; +} + +static u32 get_xsave_desc_size(void) +{ + u32 cnt = 0; + u32 i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) + cnt++; + + return cnt * (sizeof(struct x86_xfeat_component)); +} + +int elf_coredump_extra_notes_write(struct coredump_params *cprm) +{ + int num_records = 0; + struct elf_note en; + + if (!fpu_user_cfg.max_features) + return 0; + + en.n_namesz = sizeof(owner_name); + en.n_descsz = get_xsave_desc_size(); + en.n_type = NT_X86_XSAVE_LAYOUT; + + if (!dump_emit(cprm, &en, sizeof(en))) + return 1; + if (!dump_emit(cprm, owner_name, en.n_namesz)) + return 1; + if (!dump_align(cprm, 4)) + return 1; + + num_records = dump_xsave_layout_desc(cprm); + if (!num_records) + return 1; + + /* Total size should be equal to the number of records */ + if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) + return 1; + + return 0; +} + +int elf_coredump_extra_notes_size(void) +{ + int size; + + if (!fpu_user_cfg.max_features) + return 0; + + /* .note header */ + size = sizeof(struct elf_note); + /* Name plus alignment to 4 bytes */ + size += roundup(sizeof(owner_name), 4); + size += get_xsave_desc_size(); + + return size; +} +#endif /* CONFIG_COREDUMP */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 19fa49cd9907..01bcbe7fdebd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2039,7 +2039,7 @@ static int elf_core_dump(struct coredump_params *cprm) { size_t sz = info.size; - /* For cell spufs */ + /* For cell spufs and x86 xstate */ sz += elf_coredump_extra_notes_size(); phdr4note = kmalloc(sizeof(*phdr4note), GFP_KERNEL); @@ -2103,7 +2103,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!write_note_info(&info, cprm)) goto end_coredump; - /* For cell spufs */ + /* For cell spufs and x86 xstate */ if (elf_coredump_extra_notes_write(cprm)) goto end_coredump; diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b54b313bcf07..e30a9b47dc87 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -411,6 +411,7 @@ typedef struct elf64_shdr { #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ /* Old binutils treats 0x203 as a CET state */ #define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ #define NT_S390_TIMER 0x301 /* s390 timer register */ #define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ From b745fdeff5398b108bbd1f8df19eba8e4b33fe77 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Mon, 29 Jul 2024 15:01:27 +0100 Subject: [PATCH 006/126] docs/core-api: memory-allocation: GFP_NOWAIT doesn't need __GFP_NOWARN Since v6.8 the definition of GFP_NOWAIT has implied __GFP_NOWARN, so it is now redundant to add this flag explicitly. Update the docs to match, and emphasise the need for a fallback when using GFP_NOWAIT. Fixes: 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") Signed-off-by: Dave Martin Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (Microsoft) Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240729140127.244606-1-Dave.Martin@arm.com --- Documentation/core-api/memory-allocation.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst index 8b84eb4bdae7..0f19dd524323 100644 --- a/Documentation/core-api/memory-allocation.rst +++ b/Documentation/core-api/memory-allocation.rst @@ -45,8 +45,9 @@ here we briefly outline their recommended usage: * If the allocation is performed from an atomic context, e.g interrupt handler, use ``GFP_NOWAIT``. This flag prevents direct reclaim and IO or filesystem operations. Consequently, under memory pressure - ``GFP_NOWAIT`` allocation is likely to fail. Allocations which - have a reasonable fallback should be using ``GFP_NOWARN``. + ``GFP_NOWAIT`` allocation is likely to fail. Users of this flag need + to provide a suitable fallback to cope with such failures where + appropriate. * If you think that accessing memory reserves is justified and the kernel will be stressed unless allocation succeeds, you may use ``GFP_ATOMIC``. * Untrusted allocations triggered from userspace should be a subject From e6a5af90f0a24f08445e3b8a11b727ac84a9520c Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 26 Jul 2024 22:57:23 +0800 Subject: [PATCH 007/126] docs/zh_CN: add the translation of kbuild/headers_install.rst Finish the translation of kbuild/headers_install.rst and kbuild/index.rst, then add kbuild into zh_CN/index.rst. Update to commit 5b67fbfc32b5 ("Merge tag 'kbuild-v5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild") Signed-off-by: Dongliang Mu Reviewed-by: Alex Shi Reviewed-by: Yanteng Si [jc: added missing EOF newline ] Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240726145754.2598197-1-dzm91@hust.edu.cn --- Documentation/translations/zh_CN/index.rst | 2 +- .../zh_CN/kbuild/headers_install.rst | 39 +++++++++++++++++++ .../translations/zh_CN/kbuild/index.rst | 35 +++++++++++++++++ 3 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/kbuild/headers_install.rst create mode 100644 Documentation/translations/zh_CN/kbuild/index.rst diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index 20b9d4270d1f..7574e1673180 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -89,10 +89,10 @@ TODOList: admin-guide/index admin-guide/reporting-issues.rst userspace-api/index + 内核构建系统 TODOList: -* 内核构建系统 * 用户空间工具 也可参考独立于内核文档的 `Linux 手册页 `_ 。 diff --git a/Documentation/translations/zh_CN/kbuild/headers_install.rst b/Documentation/translations/zh_CN/kbuild/headers_install.rst new file mode 100644 index 000000000000..02cb8896e555 --- /dev/null +++ b/Documentation/translations/zh_CN/kbuild/headers_install.rst @@ -0,0 +1,39 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/kbuild/headers_install.rst +:Translator: 慕冬亮 Dongliang Mu + +============================ +导出内核头文件供用户空间使用 +============================ + +"make headers_install" 命令以适合于用户空间程序的形式导出内核头文件。 + +Linux 内核导出的头文件描述了用户空间程序尝试使用内核服务的 API。这些内核 +头文件被系统的 C 库(例如 glibc 和 uClibc)用于定义可用的系统调用,以及 +与这些系统调用一起使用的常量和结构。C 库的头文件包括来自 linux 子目录的 +内核头文件。系统的 libc 头文件通常被安装在默认位置 /usr/include,而内核 +头文件在该位置的子目录中(主要是 /usr/include/linux 和 /usr/include/asm)。 + +内核头文件向后兼容,但不向前兼容。这意味着使用旧内核头文件的 C 库构建的程序 +可以在新内核上运行(尽管它可能无法访问新特性),但使用新内核头文件构建的程序 +可能无法在旧内核上运行。 + +"make headers_install" 命令可以在内核源代码的顶层目录中运行(或使用标准 +的树外构建)。它接受两个可选参数:: + + make headers_install ARCH=i386 INSTALL_HDR_PATH=/usr + +ARCH 表明为其生成头文件的架构,默认为当前架构。导出内核头文件的 linux/asm +目录是基于特定平台的,要查看支持架构的完整列表,使用以下命令:: + + ls -d include/asm-* | sed 's/.*-//' + +INSTALL_HDR_PATH 表明头文件的安装位置,默认为 "./usr"。 + +该命令会在 INSTALL_HDR_PATH 中自动创建创建一个 'include' 目录,而头文件 +会被安装在 INSTALL_HDR_PATH/include 中。 + +内核头文件导出的基础设施由 David Woodhouse 维护。 diff --git a/Documentation/translations/zh_CN/kbuild/index.rst b/Documentation/translations/zh_CN/kbuild/index.rst new file mode 100644 index 000000000000..b9feb56b846a --- /dev/null +++ b/Documentation/translations/zh_CN/kbuild/index.rst @@ -0,0 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/kbuild/index +:Translator: 慕冬亮 Dongliang Mu + +============ +内核编译系统 +============ + +.. toctree:: + :maxdepth: 1 + + headers_install + +TODO: + +- kconfig-language +- kconfig-macro-language +- kbuild +- kconfig +- makefiles +- modules +- issues +- reproducible-builds +- gcc-plugins +- llvm + +.. only:: subproject and html + + 目录 + ===== + + * :ref:`genindex` From 0a8e4dc1d353f0931c5f42487f842e94e158a039 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Thu, 25 Jul 2024 19:11:21 +0200 Subject: [PATCH 008/126] Documentation: ioctl: document 0x07 ioctl code It looks like this was supposed to be documented when VSOCK was added[1], but it got lost in later submissions. Link: https://lore.kernel.org/20130109000024.3719.71468.stgit@promb-2n-dhcp175.eng.vmware.com/#Z31Documentation:ioctl:ioctl-number.txt [1] Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Alyssa Ross Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240725171120.12226-2-hi@alyssa.is --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index e91c0376ee59..217bdc76fe56 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -78,6 +78,7 @@ Code Seq# Include File Comments 0x03 all linux/hdreg.h 0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these. 0x06 all linux/lp.h +0x07 9F-D0 linux/vmw_vmci_defs.h, uapi/linux/vm_sockets.h 0x09 all linux/raid/md_u.h 0x10 00-0F drivers/char/s390/vmcp.h 0x10 10-1F arch/s390/include/uapi/sclp_ctl.h From 565a3041b5273f2ffc75145411791364bcb45056 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 23 Jul 2024 15:32:58 -0700 Subject: [PATCH 009/126] MAINTAINERS: add Documentation/dev-tools/ to workflows@ The goal of the workflows@ mailing list was to make it easier for maintainers who don't use lore+lei to subscribe to topics related to process changes. In other words it should cover changes to Documentation files which most maintainers should know about. Recent changes from Kees [1] to provide guidelines on naming KUnit files did not fall under workflows@ since Documentation/dev-tools/ isn't covered. The patch volume for dev-tools isn't huge and most of the changes are interesting. Add it. Link: https://lore.kernel.org/20240720165441.it.320-kees@kernel.org/ # [1] Signed-off-by: Jakub Kicinski Reviewed-by: Kees Cook Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240723223258.2197852-1-kuba@kernel.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..b34385f2e46d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6711,6 +6711,7 @@ DOCUMENTATION PROCESS M: Jonathan Corbet L: workflows@vger.kernel.org S: Maintained +F: Documentation/dev-tools/ F: Documentation/maintainer/ F: Documentation/process/ From 63e96ce050e52613eafe5c5dd90a9b81ce6eddb2 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 19 Jul 2024 12:13:33 +0800 Subject: [PATCH 010/126] scripts: fix all issues reported by pylint This patch 1) fixes all the issues (not most) reported by pylint, 2) add the functionability to tackle documents that need translation, 3) add logging to adjust the logging level and log file Signed-off-by: Dongliang Mu Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240719041400.3909775-2-dzm91@hust.edu.cn --- scripts/checktransupdate.py | 212 ++++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 72 deletions(-) diff --git a/scripts/checktransupdate.py b/scripts/checktransupdate.py index 5a0fc99e3f93..578c3fecfdfd 100755 --- a/scripts/checktransupdate.py +++ b/scripts/checktransupdate.py @@ -10,31 +10,28 @@ differences occur, report the file and commits that need to be updated. The usage is as follows: - ./scripts/checktransupdate.py -l zh_CN -This will print all the files that need to be updated in the zh_CN locale. +This will print all the files that need to be updated or translated in the zh_CN locale. - ./scripts/checktransupdate.py Documentation/translations/zh_CN/dev-tools/testing-overview.rst This will only print the status of the specified file. The output is something like: -Documentation/translations/zh_CN/dev-tools/testing-overview.rst (1 commits) +Documentation/dev-tools/kfence.rst +No translation in the locale of zh_CN + +Documentation/translations/zh_CN/dev-tools/testing-overview.rst commit 42fb9cfd5b18 ("Documentation: dev-tools: Add link to RV docs") +1 commits needs resolving in total """ import os -from argparse import ArgumentParser, BooleanOptionalAction +import time +import logging +from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction from datetime import datetime -flag_p_c = False -flag_p_uf = False -flag_debug = False - - -def dprint(*args, **kwargs): - if flag_debug: - print("[DEBUG] ", end="") - print(*args, **kwargs) - def get_origin_path(file_path): + """Get the origin path from the translation path""" paths = file_path.split("/") tidx = paths.index("translations") opaths = paths[:tidx] @@ -43,17 +40,16 @@ def get_origin_path(file_path): def get_latest_commit_from(file_path, commit): - command = "git log --pretty=format:%H%n%aD%n%cD%n%n%B {} -1 -- {}".format( - commit, file_path - ) - dprint(command) + """Get the latest commit from the specified commit for the specified file""" + command = f"git log --pretty=format:%H%n%aD%n%cD%n%n%B {commit} -1 -- {file_path}" + logging.debug(command) pipe = os.popen(command) result = pipe.read() result = result.split("\n") if len(result) <= 1: return None - dprint("Result: {}".format(result[0])) + logging.debug("Result: %s", result[0]) return { "hash": result[0], @@ -64,17 +60,19 @@ def get_latest_commit_from(file_path, commit): def get_origin_from_trans(origin_path, t_from_head): + """Get the latest origin commit from the translation commit""" o_from_t = get_latest_commit_from(origin_path, t_from_head["hash"]) while o_from_t is not None and o_from_t["author_date"] > t_from_head["author_date"]: o_from_t = get_latest_commit_from(origin_path, o_from_t["hash"] + "^") if o_from_t is not None: - dprint("tracked origin commit id: {}".format(o_from_t["hash"])) + logging.debug("tracked origin commit id: %s", o_from_t["hash"]) return o_from_t def get_commits_count_between(opath, commit1, commit2): - command = "git log --pretty=format:%H {}...{} -- {}".format(commit1, commit2, opath) - dprint(command) + """Get the commits count between two commits for the specified file""" + command = f"git log --pretty=format:%H {commit1}...{commit2} -- {opath}" + logging.debug(command) pipe = os.popen(command) result = pipe.read().split("\n") # filter out empty lines @@ -83,50 +81,120 @@ def get_commits_count_between(opath, commit1, commit2): def pretty_output(commit): - command = "git log --pretty='format:%h (\"%s\")' -1 {}".format(commit) - dprint(command) + """Pretty print the commit message""" + command = f"git log --pretty='format:%h (\"%s\")' -1 {commit}" + logging.debug(command) pipe = os.popen(command) return pipe.read() +def valid_commit(commit): + """Check if the commit is valid or not""" + msg = pretty_output(commit) + return "Merge tag" not in msg + def check_per_file(file_path): + """Check the translation status for the specified file""" opath = get_origin_path(file_path) if not os.path.isfile(opath): - dprint("Error: Cannot find the origin path for {}".format(file_path)) + logging.error("Cannot find the origin path for {file_path}") return o_from_head = get_latest_commit_from(opath, "HEAD") t_from_head = get_latest_commit_from(file_path, "HEAD") if o_from_head is None or t_from_head is None: - print("Error: Cannot find the latest commit for {}".format(file_path)) + logging.error("Cannot find the latest commit for %s", file_path) return o_from_t = get_origin_from_trans(opath, t_from_head) if o_from_t is None: - print("Error: Cannot find the latest origin commit for {}".format(file_path)) + logging.error("Error: Cannot find the latest origin commit for %s", file_path) return if o_from_head["hash"] == o_from_t["hash"]: - if flag_p_uf: - print("No update needed for {}".format(file_path)) - return + logging.debug("No update needed for %s", file_path) else: - print("{}".format(file_path), end="\t") + logging.info(file_path) commits = get_commits_count_between( opath, o_from_t["hash"], o_from_head["hash"] ) - print("({} commits)".format(len(commits))) - if flag_p_c: - for commit in commits: - msg = pretty_output(commit) - if "Merge tag" not in msg: - print("commit", msg) + count = 0 + for commit in commits: + if valid_commit(commit): + logging.info("commit %s", pretty_output(commit)) + count += 1 + logging.info("%d commits needs resolving in total\n", count) + + +def valid_locales(locale): + """Check if the locale is valid or not""" + script_path = os.path.dirname(os.path.abspath(__file__)) + linux_path = os.path.join(script_path, "..") + if not os.path.isdir(f"{linux_path}/Documentation/translations/{locale}"): + raise ArgumentTypeError("Invalid locale: {locale}") + return locale + + +def list_files_with_excluding_folders(folder, exclude_folders, include_suffix): + """List all files with the specified suffix in the folder and its subfolders""" + files = [] + stack = [folder] + + while stack: + pwd = stack.pop() + # filter out the exclude folders + if os.path.basename(pwd) in exclude_folders: + continue + # list all files and folders + for item in os.listdir(pwd): + ab_item = os.path.join(pwd, item) + if os.path.isdir(ab_item): + stack.append(ab_item) + else: + if ab_item.endswith(include_suffix): + files.append(ab_item) + + return files + + +class DmesgFormatter(logging.Formatter): + """Custom dmesg logging formatter""" + def format(self, record): + timestamp = time.time() + formatted_time = f"[{timestamp:>10.6f}]" + log_message = f"{formatted_time} {record.getMessage()}" + return log_message + + +def config_logging(log_level, log_file="checktransupdate.log"): + """configure logging based on the log level""" + # set up the root logger + logger = logging.getLogger() + logger.setLevel(log_level) + + # Create console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(log_level) + + # Create file handler + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(log_level) + + # Create formatter and add it to the handlers + formatter = DmesgFormatter() + console_handler.setFormatter(formatter) + file_handler.setFormatter(formatter) + + # Add the handler to the logger + logger.addHandler(console_handler) + logger.addHandler(file_handler) def main(): + """Main function of the script""" script_path = os.path.dirname(os.path.abspath(__file__)) linux_path = os.path.join(script_path, "..") @@ -134,62 +202,62 @@ def main(): parser.add_argument( "-l", "--locale", + default="zh_CN", + type=valid_locales, help="Locale to check when files are not specified", ) + parser.add_argument( - "--print-commits", + "--print-missing-translations", action=BooleanOptionalAction, default=True, - help="Print commits between the origin and the translation", + help="Print files that do not have translations", ) parser.add_argument( - "--print-updated-files", - action=BooleanOptionalAction, - default=False, - help="Print files that do no need to be updated", - ) + '--log', + default='INFO', + choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Set the logging level') parser.add_argument( - "--debug", - action=BooleanOptionalAction, - help="Print debug information", - default=False, - ) + '--logfile', + default='checktransupdate.log', + help='Set the logging file (default: checktransupdate.log)') parser.add_argument( "files", nargs="*", help="Files to check, if not specified, check all files" ) args = parser.parse_args() - global flag_p_c, flag_p_uf, flag_debug - flag_p_c = args.print_commits - flag_p_uf = args.print_updated_files - flag_debug = args.debug + # Configure logging based on the --log argument + log_level = getattr(logging, args.log.upper(), logging.INFO) + config_logging(log_level) - # get files related to linux path + # Get files related to linux path files = args.files if len(files) == 0: - if args.locale is not None: - files = ( - os.popen( - "find {}/Documentation/translations/{} -type f".format( - linux_path, args.locale - ) - ) - .read() - .split("\n") - ) - else: - files = ( - os.popen( - "find {}/Documentation/translations -type f".format(linux_path) - ) - .read() - .split("\n") - ) + offical_files = list_files_with_excluding_folders( + os.path.join(linux_path, "Documentation"), ["translations", "output"], "rst" + ) + + for file in offical_files: + # split the path into parts + path_parts = file.split(os.sep) + # find the index of the "Documentation" directory + kindex = path_parts.index("Documentation") + # insert the translations and locale after the Documentation directory + new_path_parts = path_parts[:kindex + 1] + ["translations", args.locale] \ + + path_parts[kindex + 1 :] + # join the path parts back together + new_file = os.sep.join(new_path_parts) + if os.path.isfile(new_file): + files.append(new_file) + else: + if args.print_missing_translations: + logging.info(os.path.relpath(os.path.abspath(file), linux_path)) + logging.info("No translation in the locale of %s\n", args.locale) - files = list(filter(lambda x: x != "", files)) files = list(map(lambda x: os.path.relpath(os.path.abspath(x), linux_path), files)) # cd to linux root directory From d40981350844c2cfa437abfc80596e10ea8f1149 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 19 Jul 2024 12:13:34 +0800 Subject: [PATCH 011/126] doc-guide: add help documentation checktransupdate.rst This commit adds help documents - doc-guide/checktransupdate.rst and zh_CN/doc-guide/checktransupdate.rst for scripts/checktransupdate.py , including English and Chinese versions Signed-off-by: Dongliang Mu Reviewed-by: Yanteng Si [jc: fixed missing title problem in the new file] Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240719041400.3909775-3-dzm91@hust.edu.cn --- Documentation/doc-guide/checktransupdate.rst | 54 ++++++++++++++++++ Documentation/doc-guide/index.rst | 1 + .../zh_CN/doc-guide/checktransupdate.rst | 55 +++++++++++++++++++ .../translations/zh_CN/doc-guide/index.rst | 1 + 4 files changed, 111 insertions(+) create mode 100644 Documentation/doc-guide/checktransupdate.rst create mode 100644 Documentation/translations/zh_CN/doc-guide/checktransupdate.rst diff --git a/Documentation/doc-guide/checktransupdate.rst b/Documentation/doc-guide/checktransupdate.rst new file mode 100644 index 000000000000..dfaf9d373747 --- /dev/null +++ b/Documentation/doc-guide/checktransupdate.rst @@ -0,0 +1,54 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Checking for needed translation updates +======================================= + +This script helps track the translation status of the documentation in +different locales, i.e., whether the documentation is up-to-date with +the English counterpart. + +How it works +------------ + +It uses ``git log`` command to track the latest English commit from the +translation commit (order by author date) and the latest English commits +from HEAD. If any differences occur, the file is considered as out-of-date, +then commits that need to be updated will be collected and reported. + +Features implemented + +- check all files in a certain locale +- check a single file or a set of files +- provide options to change output format +- track the translation status of files that have no translation + +Usage +----- + +:: + + ./scripts/checktransupdate.py --help + +Please refer to the output of argument parser for usage details. + +Samples + +- ``./scripts/checktransupdate.py -l zh_CN`` + This will print all the files that need to be updated in the zh_CN locale. +- ``./scripts/checktransupdate.py Documentation/translations/zh_CN/dev-tools/testing-overview.rst`` + This will only print the status of the specified file. + +Then the output is something like: + +:: + + Documentation/dev-tools/kfence.rst + No translation in the locale of zh_CN + + Documentation/translations/zh_CN/dev-tools/testing-overview.rst + commit 42fb9cfd5b18 ("Documentation: dev-tools: Add link to RV docs") + 1 commits needs resolving in total + +Features to be implemented + +- files can be a folder instead of only a file diff --git a/Documentation/doc-guide/index.rst b/Documentation/doc-guide/index.rst index 7c7d97784626..24d058faa75c 100644 --- a/Documentation/doc-guide/index.rst +++ b/Documentation/doc-guide/index.rst @@ -12,6 +12,7 @@ How to write kernel documentation parse-headers contributing maintainer-profile + checktransupdate .. only:: subproject and html diff --git a/Documentation/translations/zh_CN/doc-guide/checktransupdate.rst b/Documentation/translations/zh_CN/doc-guide/checktransupdate.rst new file mode 100644 index 000000000000..d20b4ce66b9f --- /dev/null +++ b/Documentation/translations/zh_CN/doc-guide/checktransupdate.rst @@ -0,0 +1,55 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/doc-guide/checktransupdate.rst + +:译者: 慕冬亮 Dongliang Mu + +检查翻译更新 + +这个脚本帮助跟踪不同语言的文档翻译状态,即文档是否与对应的英文版本保持更新。 + +工作原理 +------------ + +它使用 ``git log`` 命令来跟踪翻译提交的最新英文提交(按作者日期排序)和英文文档的 +最新提交。如果有任何差异,则该文件被认为是过期的,然后需要更新的提交将被收集并报告。 + +实现的功能 + +- 检查特定语言中的所有文件 +- 检查单个文件或一组文件 +- 提供更改输出格式的选项 +- 跟踪没有翻译过的文件的翻译状态 + +用法 +----- + +:: + + ./scripts/checktransupdate.py --help + +具体用法请参考参数解析器的输出 + +示例 + +- ``./scripts/checktransupdate.py -l zh_CN`` + 这将打印 zh_CN 语言中需要更新的所有文件。 +- ``./scripts/checktransupdate.py Documentation/translations/zh_CN/dev-tools/testing-overview.rst`` + 这将只打印指定文件的状态。 + +然后输出类似如下的内容: + +:: + + Documentation/dev-tools/kfence.rst + No translation in the locale of zh_CN + + Documentation/translations/zh_CN/dev-tools/testing-overview.rst + commit 42fb9cfd5b18 ("Documentation: dev-tools: Add link to RV docs") + 1 commits needs resolving in total + +待实现的功能 + +- 文件参数可以是文件夹而不仅仅是单个文件 diff --git a/Documentation/translations/zh_CN/doc-guide/index.rst b/Documentation/translations/zh_CN/doc-guide/index.rst index 78c2e9a1697f..0ac1fc9315ea 100644 --- a/Documentation/translations/zh_CN/doc-guide/index.rst +++ b/Documentation/translations/zh_CN/doc-guide/index.rst @@ -18,6 +18,7 @@ parse-headers contributing maintainer-profile + checktransupdate .. only:: subproject and html From 1b2255db3c22b0e6a53781871afe95d7cd1a69a6 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 17 Jul 2024 16:35:21 -0400 Subject: [PATCH 012/126] Documentation: Add detailed explanation for 'N' taint flag Every taint flag has an entry in the "More detailed explanation" section except for the 'N' flag. That omission was probably just an oversight so add an entry for that flag. Signed-off-by: Benjamin Poirier Acked-by: Luis Chamberlain Reviewed-by: David Gow Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240717203521.514348-1-bpoirier@nvidia.com --- Documentation/admin-guide/tainted-kernels.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst index f92551539e8a..700aa72eecb1 100644 --- a/Documentation/admin-guide/tainted-kernels.rst +++ b/Documentation/admin-guide/tainted-kernels.rst @@ -182,3 +182,5 @@ More detailed explanation for tainting produce extremely unusual kernel structure layouts (even performance pathological ones), which is important to know when debugging. Set at build time. + + 18) ``N`` if an in-kernel test, such as a KUnit test, has been run. From 8663dd38a7ba5b2bfd2c7b4271e6e63bc0ef1e42 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 30 Jul 2024 13:36:40 +0800 Subject: [PATCH 013/126] docs/zh_CN: fix a broken reference Warning: Documentation/translations/zh_CN/kbuild/index.rst references a file that doesn't exist: Documentation/kbuild/index Fix this by adding its full name. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407300812.1VvDFdxD-lkp@intel.com/ Signed-off-by: Dongliang Mu Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240730053652.4073433-1-dzm91@hust.edu.cn --- Documentation/translations/zh_CN/kbuild/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/translations/zh_CN/kbuild/index.rst b/Documentation/translations/zh_CN/kbuild/index.rst index b9feb56b846a..d906a4e88d0f 100644 --- a/Documentation/translations/zh_CN/kbuild/index.rst +++ b/Documentation/translations/zh_CN/kbuild/index.rst @@ -2,7 +2,7 @@ .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/kbuild/index +:Original: Documentation/kbuild/index.rst :Translator: 慕冬亮 Dongliang Mu ============ From b4bac279319d3082eb42f074799c7b18ba528c71 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 29 Jul 2024 10:12:02 +0800 Subject: [PATCH 014/126] x86/tsc: Use topology_max_packages() to get package number Commit b50db7095fe0 ("x86/tsc: Disable clocksource watchdog for TSC on qualified platorms") was introduced to solve problem that sometimes TSC clocksource is wrongly judged as unstable by watchdog like 'jiffies', HPET, etc. In it, the hardware package number is a key factor for judging whether to disable the watchdog for TSC, and 'nr_online_nodes' was chosen due to, at that time (kernel v5.1x), it is available in early boot phase before registering 'tsc-early' clocksource, where all non-boot CPUs are not brought up yet. Dave and Rui pointed out there are many cases in which 'nr_online_nodes' is cheated and not accurate, like: * SNC (sub-numa cluster) mode enabled * numa emulation (numa=fake=8 etc.) * numa=off * platforms with CPU-less HBM nodes, CPU-less Optane memory nodes. * 'maxcpus=' cmdline setup, where chopped CPUs could be onlined later * 'nr_cpus=', 'possible_cpus=' cmdline setup, where chopped CPUs can not be onlined after boot The SNC case is the most user-visible case, as many CSP (Cloud Service Provider) enable this feature in their server fleets. When SNC3 enabled, a 2 socket machine will appear to have 6 NUMA nodes, and get impacted by the issue in reality. Thomas' recent patchset of refactoring x86 topology code improves topology_max_packages() greatly, by making it more accurate and available in early boot phase, which works well in most of the above cases. The only exceptions are 'nr_cpus=' and 'possible_cpus=' setup, which may under-estimate the package number. As during topology setup, the boot CPU iterates through all enumerated APIC IDs and either accepts or rejects the APIC ID. For accepted IDs, it figures out which bits of the ID map to the package number. It tracks which package numbers have been seen in a bitmap. topology_max_packages() just returns the number of bits set in that bitmap. 'nr_cpus=' and 'possible_cpus=' can cause more APIC IDs to be rejected and can artificially lower the number of bits in the package bitmap and thus topology_max_packages(). This means that, for example, a system with 8 physical packages might reject all the CPUs on 6 of those packages and be left with only 2 packages and 2 bits set in the package bitmap. It needs the TSC watchdog, but would disable it anyway. This isn't ideal, but it only happens for debug-oriented options. This is fixable by tracking the package numbers for rejected CPUs. But it's not worth the trouble for debugging. So use topology_max_packages() to replace nr_online_nodes(). Reported-by: Dave Hansen Signed-off-by: Feng Tang Signed-off-by: Thomas Gleixner Reviewed-by: Waiman Long Link: https://lore.kernel.org/all/20240729021202.180955-1-feng.tang@intel.com Closes: https://lore.kernel.org/lkml/a4860054-0f16-6513-f121-501048431086@intel.com/ --- arch/x86/kernel/tsc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d4462fb26299..0ced1870ed40 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ @@ -1253,15 +1254,12 @@ static void __init check_system_tsc_reliable(void) * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications - * - not more than two sockets. As the number of sockets cannot be - * evaluated at the early boot stage where this has to be - * invoked, check the number of online memory nodes as a - * fallback solution which is an reasonable estimate. + * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && - nr_online_nodes <= 4) + topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } From 39e470057f785eab7b6638feb3f24cbad6816aff Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:41 +0200 Subject: [PATCH 015/126] tools/x86/kcpuid: Remove unused variable Global variable "num_leafs" is set in multiple places but is never read anywhere. Remove it. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-2-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 24b7d017ec2c..e1973d8b322e 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -76,7 +76,6 @@ struct cpuid_range { */ struct cpuid_range *leafs_basic, *leafs_ext; -static int num_leafs; static bool is_amd; static bool show_details; static bool show_raw; @@ -246,7 +245,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) allzero = cpuid_store(range, f, subleaf, eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; if (!has_subleafs(f)) continue; @@ -272,7 +270,6 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) eax, ebx, ecx, edx); if (allzero) continue; - num_leafs++; } } From a52e735f282c963155090d2d60726324ccd0e4bc Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:42 +0200 Subject: [PATCH 016/126] tools/x86/kcpuid: Properly align long-description columns When kcpuid is invoked with "--all --details", the detailed description column is not properly aligned for all bitfield rows: CPUID_0x4_ECX[0x0]: cache_level : 0x1 - Cache Level ... cache_self_init - Cache Self Initialization This is due to differences in output handling between boolean single-bit "bitflags" and multi-bit bitfields. For the former, the bitfield's value is not outputted as it is implied to be true by just outputting the bitflag's name in its respective line. If long descriptions were requested through the --all parameter, properly align the bitflag's description columns through extra tabs. With that, the sample output above becomes: CPUID_0x4_ECX[0x0]: cache_level : 0x1 - Cache Level ... cache_self_init - Cache Self Initialization Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-3-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index e1973d8b322e..08f64d9ecb40 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -449,8 +449,9 @@ static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg) if (start == end) { /* single bit flag */ if (value & (1 << start)) - printf("\t%-20s %s%s\n", + printf("\t%-20s %s%s%s\n", bdesc->simp, + show_flags_only ? "" : "\t\t\t", show_details ? "-" : "", show_details ? bdesc->detail : "" ); From 5dd7ca42475bb15fd93d58d42e925512776f14a4 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:43 +0200 Subject: [PATCH 017/126] tools/x86/kcpuid: Set max possible subleaves count to 64 cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. One of the new leaves is 0xd for extended CPU state enumeration. Depending on XCR0 dword bits, it can export up to 64 subleaves. Set kcpuid.c MAX_SUBLEAF_NUM to 64. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-4-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 08f64d9ecb40..a87cddc19554 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -203,7 +203,7 @@ static void raw_dump_range(struct cpuid_range *range) } } -#define MAX_SUBLEAF_NUM 32 +#define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { u32 max_func, idx_func; From cf96ab1a966b87b09fdd9e8cc8357d2d00776a3a Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:44 +0200 Subject: [PATCH 018/126] tools/x86/kcpuid: Protect against faulty "max subleaf" values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Protect against the kcpuid code parsing faulty max subleaf numbers through a min() expression. Thus, ensuring that max_subleaf will always be ≤ MAX_SUBLEAF_NUM. Use "u32" for the subleaf numbers since kcpuid is compiled with -Wextra, which includes signed/unsigned comparisons warnings. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-5-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index a87cddc19554..581d28c861ee 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -7,7 +7,8 @@ #include #include -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define min(a, b) (((a) < (b)) ? (a) : (b)) typedef unsigned int u32; typedef unsigned long long u64; @@ -206,12 +207,9 @@ static void raw_dump_range(struct cpuid_range *range) #define MAX_SUBLEAF_NUM 64 struct cpuid_range *setup_cpuid_range(u32 input_eax) { - u32 max_func, idx_func; - int subleaf; + u32 max_func, idx_func, subleaf, max_subleaf; + u32 eax, ebx, ecx, edx, f = input_eax; struct cpuid_range *range; - u32 eax, ebx, ecx, edx; - u32 f = input_eax; - int max_subleaf; bool allzero; eax = input_eax; @@ -256,7 +254,7 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * others have to be tried (0xf) */ if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) - max_subleaf = (eax & 0xff) + 1; + max_subleaf = min((eax & 0xff) + 1, max_subleaf); if (f == 0xb) max_subleaf = 2; From 9ecbc60a5ede928abdd2b152d828ae0ea8a1e3ed Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:45 +0200 Subject: [PATCH 019/126] tools/x86/kcpuid: Strip bitfield names leading/trailing whitespace While parsing and saving bitfield names from the CSV file, an extra leading space is copied verbatim. That extra space is not a big issue now, but further commits will add a new CSV file with much more padding for the bitfield's name column. Strip leading/trailing whitespaces while saving bitfield names. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-6-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 581d28c861ee..c4f0ace5c9ff 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -379,7 +379,7 @@ static int parse_line(char *line) if (start) bdesc->start = strtoul(start, NULL, 0); - strcpy(bdesc->simp, tokens[4]); + strcpy(bdesc->simp, strtok(tokens[4], " \t")); strcpy(bdesc->detail, tokens[5]); return 0; From b0a59d14966dbfe0d544b2595dcb29728d41daf3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:46 +0200 Subject: [PATCH 020/126] tools/x86/kcpuid: Recognize all leaves with subleaves cpuid.csv will be extended in further commits with all-publicly-known CPUID leaves and bitfields. Thus, modify has_subleafs() to identify all known leaves with subleaves. Remove the redundant "is_amd" check since all x86 vendors already report the maxium supported extended leaf at leaf 0x80000000 EAX register. The extra mentioned leaves are: - Leaf 0x12, Intel Software Guard Extensions (SGX) enumeration - Leaf 0x14, Intel process trace (PT) enumeration - Leaf 0x17, Intel SoC vendor attributes enumeration - Leaf 0x1b, Intel PCONFIG (Platform configuration) enumeration - Leaf 0x1d, Intel AMX (Advanced Matrix Extensions) tile information - Leaf 0x1f, Intel v2 extended topology enumeration - Leaf 0x23, Intel ArchPerfmonExt (Architectural PMU ext) enumeration - Leaf 0x80000020, AMD Platform QoS extended features enumeration - Leaf 0x80000026, AMD v2 extended topology enumeration Set the 'max_subleaf' variable for all the newly marked leaves with extra subleaves. Ideally, this should be fetched from the CSV file instead, but the current kcpuid code architecture has two runs: one run to serially invoke the cpuid instructions and save all the output in-memory, and one run to parse this in-memory output through the CSV specification. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-7-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index c4f0ace5c9ff..725a7a27fcac 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -98,27 +98,17 @@ static inline void cpuid(u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) static inline bool has_subleafs(u32 f) { - if (f == 0x7 || f == 0xd) - return true; + u32 with_subleaves[] = { + 0x4, 0x7, 0xb, 0xd, 0xf, 0x10, 0x12, + 0x14, 0x17, 0x18, 0x1b, 0x1d, 0x1f, 0x23, + 0x8000001d, 0x80000020, 0x80000026, + }; - if (is_amd) { - if (f == 0x8000001d) + for (unsigned i = 0; i < ARRAY_SIZE(with_subleaves); i++) + if (f == with_subleaves[i]) return true; - return false; - } - switch (f) { - case 0x4: - case 0xb: - case 0xf: - case 0x10: - case 0x14: - case 0x18: - case 0x1f: - return true; - default: - return false; - } + return false; } static void leaf_print_raw(struct subleaf *leaf) @@ -253,11 +243,18 @@ struct cpuid_range *setup_cpuid_range(u32 input_eax) * Some can provide the exact number of subleafs, * others have to be tried (0xf) */ - if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18) + if (f == 0x7 || f == 0x14 || f == 0x17 || f == 0x18 || f == 0x1d) max_subleaf = min((eax & 0xff) + 1, max_subleaf); - if (f == 0xb) max_subleaf = 2; + if (f == 0x1f) + max_subleaf = 6; + if (f == 0x23) + max_subleaf = 4; + if (f == 0x80000020) + max_subleaf = 4; + if (f == 0x80000026) + max_subleaf = 5; for (subleaf = 1; subleaf < max_subleaf; subleaf++) { eax = f; From 58921443e9b04bbb1866070ed14ea39578302cea Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:47 +0200 Subject: [PATCH 021/126] tools/x86/kcpuid: Parse subleaf ranges if provided It's a common pattern in cpuid leaves to have the same bitfields format repeated across a number of subleaves. Typically, this is used for enumerating hierarchial structures like cache and TLB levels, CPU topology levels, etc. Modify kcpuid.c to handle subleaf ranges in the CSV file subleaves column. For example, make it able to parse lines in the form: # LEAF, SUBLEAVES, reg, bits, short_name , ... 0xb, 1:0, eax, 4:0, x2apic_id_shift , ... 0xb, 1:0, ebx, 15:0, domain_lcpus_count , ... 0xb, 1:0, ecx, 7:0, domain_nr , ... This way, full output can be printed to the user. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-8-darwi@linutronix.de --- tools/arch/x86/kcpuid/kcpuid.c | 48 ++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c index 725a7a27fcac..1b25c0a95d3f 100644 --- a/tools/arch/x86/kcpuid/kcpuid.c +++ b/tools/arch/x86/kcpuid/kcpuid.c @@ -305,6 +305,8 @@ static int parse_line(char *line) struct bits_desc *bdesc; int reg_index; char *start, *end; + u32 subleaf_start, subleaf_end; + unsigned bit_start, bit_end; /* Skip comments and NULL line */ if (line[0] == '#' || line[0] == '\n') @@ -343,13 +345,25 @@ static int parse_line(char *line) return 0; /* subleaf */ - sub = strtoul(tokens[1], NULL, 0); - if ((int)sub > func->nr) - return -1; + buf = tokens[1]; + end = strtok(buf, ":"); + start = strtok(NULL, ":"); + subleaf_end = strtoul(end, NULL, 0); - leaf = &func->leafs[sub]; + /* A subleaf range is given? */ + if (start) { + subleaf_start = strtoul(start, NULL, 0); + subleaf_end = min(subleaf_end, (u32)(func->nr - 1)); + if (subleaf_start > subleaf_end) + return 0; + } else { + subleaf_start = subleaf_end; + if (subleaf_start > (u32)(func->nr - 1)) + return 0; + } + + /* register */ buf = tokens[2]; - if (strcasestr(buf, "EAX")) reg_index = R_EAX; else if (strcasestr(buf, "EBX")) @@ -361,23 +375,23 @@ static int parse_line(char *line) else goto err_exit; - reg = &leaf->info[reg_index]; - bdesc = ®->descs[reg->nr++]; - /* bit flag or bits field */ buf = tokens[3]; - end = strtok(buf, ":"); - bdesc->end = strtoul(end, NULL, 0); - bdesc->start = bdesc->end; - - /* start != NULL means it is bit fields */ start = strtok(NULL, ":"); - if (start) - bdesc->start = strtoul(start, NULL, 0); + bit_end = strtoul(end, NULL, 0); + bit_start = (start) ? strtoul(start, NULL, 0) : bit_end; - strcpy(bdesc->simp, strtok(tokens[4], " \t")); - strcpy(bdesc->detail, tokens[5]); + for (sub = subleaf_start; sub <= subleaf_end; sub++) { + leaf = &func->leafs[sub]; + reg = &leaf->info[reg_index]; + bdesc = ®->descs[reg->nr++]; + + bdesc->end = bit_end; + bdesc->start = bit_start; + strcpy(bdesc->simp, strtok(tokens[4], " \t")); + strcpy(bdesc->detail, tokens[5]); + } return 0; err_exit: From cbbd847d107fb750e62670d0f205a7f58b36f893 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:48 +0200 Subject: [PATCH 022/126] tools/x86/kcpuid: Introduce a complete cpuid bitfields CSV file For parsing the cpuid bitfields, kcpuid uses an incomplete CSV file with 300+ bitfields. Use an auto-generated CSV file from the x86-cpuid.org project instead. It provides complete bitfields coverage: 830+ bitfields, all with proper descriptions. The auto-generated file has the following blurb automatically added: # SPDX-License-Identifier: CC0-1.0 # Generator: x86-cpuid-db v1.0 The generator tag includes the project's workspace "git describe" version string. It is intended for projects like KernelCI, to aid in verifying that the auto-generated files have not been tampered with. The file also has the blurb: # Auto-generated file. # Please submit all updates and bugfixes to https://x86-cpuid.org It's thus kindly requested that the Linux kernel's x86 tree maintainers enforce sending all updates to x86-cpuid.org's upstream database first, thus benefiting the whole ecosystem. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db/-/blob/v1.0/LICENSE.rst Link: https://gitlab.com/x86-cpuid.org/x86-cpuid-db Link: https://lore.kernel.org/all/20240718134755.378115-9-darwi@linutronix.de --- tools/arch/x86/kcpuid/cpuid.csv | 1352 ++++++++++++++++++++++--------- 1 file changed, 977 insertions(+), 375 deletions(-) diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv index e0c25b75327e..d751eb8585d0 100644 --- a/tools/arch/x86/kcpuid/cpuid.csv +++ b/tools/arch/x86/kcpuid/cpuid.csv @@ -1,451 +1,1053 @@ +# SPDX-License-Identifier: CC0-1.0 +# Generator: x86-cpuid-db v1.0 + +# +# Auto-generated file. +# Please submit all updates and bugfixes to https://x86-cpuid.org +# + # The basic row format is: -# LEAF, SUBLEAF, register_name, bits, short_name, long_description +# LEAF, SUBLEAVES, reg, bits, short_name , long_description -# Leaf 00H - 0, 0, EAX, 31:0, max_basic_leafs, Max input value for supported subleafs +# Leaf 0H +# Maximum standard leaf number + CPU vendor string -# Leaf 01H - 1, 0, EAX, 3:0, stepping, Stepping ID - 1, 0, EAX, 7:4, model, Model - 1, 0, EAX, 11:8, family, Family ID - 1, 0, EAX, 13:12, processor, Processor Type - 1, 0, EAX, 19:16, model_ext, Extended Model ID - 1, 0, EAX, 27:20, family_ext, Extended Family ID + 0, 0, eax, 31:0, max_std_leaf , Highest cpuid standard leaf supported + 0, 0, ebx, 31:0, cpu_vendorid_0 , CPU vendor ID string bytes 0 - 3 + 0, 0, ecx, 31:0, cpu_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0, 0, edx, 31:0, cpu_vendorid_1 , CPU vendor ID string bytes 4 - 7 - 1, 0, EBX, 7:0, brand, Brand Index - 1, 0, EBX, 15:8, clflush_size, CLFLUSH line size (value * 8) in bytes - 1, 0, EBX, 23:16, max_cpu_id, Maxim number of addressable logic cpu in this package - 1, 0, EBX, 31:24, apic_id, Initial APIC ID +# Leaf 1H +# CPU FMS (Family/Model/Stepping) + standard feature flags - 1, 0, ECX, 0, sse3, Streaming SIMD Extensions 3(SSE3) - 1, 0, ECX, 1, pclmulqdq, PCLMULQDQ instruction supported - 1, 0, ECX, 2, dtes64, DS area uses 64-bit layout - 1, 0, ECX, 3, mwait, MONITOR/MWAIT supported - 1, 0, ECX, 4, ds_cpl, CPL Qualified Debug Store which allows for branch message storage qualified by CPL - 1, 0, ECX, 5, vmx, Virtual Machine Extensions supported - 1, 0, ECX, 6, smx, Safer Mode Extension supported - 1, 0, ECX, 7, eist, Enhanced Intel SpeedStep Technology - 1, 0, ECX, 8, tm2, Thermal Monitor 2 - 1, 0, ECX, 9, ssse3, Supplemental Streaming SIMD Extensions 3 (SSSE3) - 1, 0, ECX, 10, l1_ctx_id, L1 data cache could be set to either adaptive mode or shared mode (check IA32_MISC_ENABLE bit 24 definition) - 1, 0, ECX, 11, sdbg, IA32_DEBUG_INTERFACE MSR for silicon debug supported - 1, 0, ECX, 12, fma, FMA extensions using YMM state supported - 1, 0, ECX, 13, cmpxchg16b, 'CMPXCHG16B - Compare and Exchange Bytes' supported - 1, 0, ECX, 14, xtpr_update, xTPR Update Control supported - 1, 0, ECX, 15, pdcm, Perfmon and Debug Capability present - 1, 0, ECX, 17, pcid, Process-Context Identifiers feature present - 1, 0, ECX, 18, dca, Prefetching data from a memory mapped device supported - 1, 0, ECX, 19, sse4_1, SSE4.1 feature present - 1, 0, ECX, 20, sse4_2, SSE4.2 feature present - 1, 0, ECX, 21, x2apic, x2APIC supported - 1, 0, ECX, 22, movbe, MOVBE instruction supported - 1, 0, ECX, 23, popcnt, POPCNT instruction supported - 1, 0, ECX, 24, tsc_deadline_timer, LAPIC supports one-shot operation using a TSC deadline value - 1, 0, ECX, 25, aesni, AESNI instruction supported - 1, 0, ECX, 26, xsave, XSAVE/XRSTOR processor extended states (XSETBV/XGETBV/XCR0) - 1, 0, ECX, 27, osxsave, OS has set CR4.OSXSAVE bit to enable XSETBV/XGETBV/XCR0 - 1, 0, ECX, 28, avx, AVX instruction supported - 1, 0, ECX, 29, f16c, 16-bit floating-point conversion instruction supported - 1, 0, ECX, 30, rdrand, RDRAND instruction supported + 1, 0, eax, 3:0, stepping , Stepping ID + 1, 0, eax, 7:4, base_model , Base CPU model ID + 1, 0, eax, 11:8, base_family_id , Base CPU family ID + 1, 0, eax, 13:12, cpu_type , CPU type + 1, 0, eax, 19:16, ext_model , Extended CPU model ID + 1, 0, eax, 27:20, ext_family , Extended CPU family ID + 1, 0, ebx, 7:0, brand_id , Brand index + 1, 0, ebx, 15:8, clflush_size , CLFLUSH instruction cache line size + 1, 0, ebx, 23:16, n_logical_cpu , Logical CPU (HW threads) count + 1, 0, ebx, 31:24, local_apic_id , Initial local APIC physical ID + 1, 0, ecx, 0, pni , Streaming SIMD Extensions 3 (SSE3) + 1, 0, ecx, 1, pclmulqdq , PCLMULQDQ instruction support + 1, 0, ecx, 2, dtes64 , 64-bit DS save area + 1, 0, ecx, 3, monitor , MONITOR/MWAIT support + 1, 0, ecx, 4, ds_cpl , CPL Qualified Debug Store + 1, 0, ecx, 5, vmx , Virtual Machine Extensions + 1, 0, ecx, 6, smx , Safer Mode Extensions + 1, 0, ecx, 7, est , Enhanced Intel SpeedStep + 1, 0, ecx, 8, tm2 , Thermal Monitor 2 + 1, 0, ecx, 9, ssse3 , Supplemental SSE3 + 1, 0, ecx, 10, cid , L1 Context ID + 1, 0, ecx, 11, sdbg , Sillicon Debug + 1, 0, ecx, 12, fma , FMA extensions using YMM state + 1, 0, ecx, 13, cx16 , CMPXCHG16B instruction support + 1, 0, ecx, 14, xtpr , xTPR Update Control + 1, 0, ecx, 15, pdcm , Perfmon and Debug Capability + 1, 0, ecx, 17, pcid , Process-context identifiers + 1, 0, ecx, 18, dca , Direct Cache Access + 1, 0, ecx, 19, sse4_1 , SSE4.1 + 1, 0, ecx, 20, sse4_2 , SSE4.2 + 1, 0, ecx, 21, x2apic , X2APIC support + 1, 0, ecx, 22, movbe , MOVBE instruction support + 1, 0, ecx, 23, popcnt , POPCNT instruction support + 1, 0, ecx, 24, tsc_deadline_timer , APIC timer one-shot operation + 1, 0, ecx, 25, aes , AES instructions + 1, 0, ecx, 26, xsave , XSAVE (and related instructions) support + 1, 0, ecx, 27, osxsave , XSAVE (and related instructions) are enabled by OS + 1, 0, ecx, 28, avx , AVX instructions support + 1, 0, ecx, 29, f16c , Half-precision floating-point conversion support + 1, 0, ecx, 30, rdrand , RDRAND instruction support + 1, 0, ecx, 31, guest_status , System is running as guest; (para-)virtualized system + 1, 0, edx, 0, fpu , Floating-Point Unit on-chip (x87) + 1, 0, edx, 1, vme , Virtual-8086 Mode Extensions + 1, 0, edx, 2, de , Debugging Extensions + 1, 0, edx, 3, pse , Page Size Extension + 1, 0, edx, 4, tsc , Time Stamp Counter + 1, 0, edx, 5, msr , Model-Specific Registers (RDMSR and WRMSR support) + 1, 0, edx, 6, pae , Physical Address Extensions + 1, 0, edx, 7, mce , Machine Check Exception + 1, 0, edx, 8, cx8 , CMPXCHG8B instruction + 1, 0, edx, 9, apic , APIC on-chip + 1, 0, edx, 11, sep , SYSENTER, SYSEXIT, and associated MSRs + 1, 0, edx, 12, mtrr , Memory Type Range Registers + 1, 0, edx, 13, pge , Page Global Extensions + 1, 0, edx, 14, mca , Machine Check Architecture + 1, 0, edx, 15, cmov , Conditional Move Instruction + 1, 0, edx, 16, pat , Page Attribute Table + 1, 0, edx, 17, pse36 , Page Size Extension (36-bit) + 1, 0, edx, 18, pn , Processor Serial Number + 1, 0, edx, 19, clflush , CLFLUSH instruction + 1, 0, edx, 21, dts , Debug Store + 1, 0, edx, 22, acpi , Thermal monitor and clock control + 1, 0, edx, 23, mmx , MMX instructions + 1, 0, edx, 24, fxsr , FXSAVE and FXRSTOR instructions + 1, 0, edx, 25, sse , SSE instructions + 1, 0, edx, 26, sse2 , SSE2 instructions + 1, 0, edx, 27, ss , Self Snoop + 1, 0, edx, 28, ht , Hyper-threading + 1, 0, edx, 29, tm , Thermal Monitor + 1, 0, edx, 30, ia64 , Legacy IA-64 (Itanium) support bit, now resreved + 1, 0, edx, 31, pbe , Pending Break Enable - 1, 0, EDX, 0, fpu, x87 FPU on chip - 1, 0, EDX, 1, vme, Virtual-8086 Mode Enhancement - 1, 0, EDX, 2, de, Debugging Extensions - 1, 0, EDX, 3, pse, Page Size Extensions - 1, 0, EDX, 4, tsc, Time Stamp Counter - 1, 0, EDX, 5, msr, RDMSR and WRMSR Support - 1, 0, EDX, 6, pae, Physical Address Extensions - 1, 0, EDX, 7, mce, Machine Check Exception - 1, 0, EDX, 8, cx8, CMPXCHG8B instr - 1, 0, EDX, 9, apic, APIC on Chip - 1, 0, EDX, 11, sep, SYSENTER and SYSEXIT instrs - 1, 0, EDX, 12, mtrr, Memory Type Range Registers - 1, 0, EDX, 13, pge, Page Global Bit - 1, 0, EDX, 14, mca, Machine Check Architecture - 1, 0, EDX, 15, cmov, Conditional Move Instrs - 1, 0, EDX, 16, pat, Page Attribute Table - 1, 0, EDX, 17, pse36, 36-Bit Page Size Extension - 1, 0, EDX, 18, psn, Processor Serial Number - 1, 0, EDX, 19, clflush, CLFLUSH instr -# 1, 0, EDX, 20, - 1, 0, EDX, 21, ds, Debug Store - 1, 0, EDX, 22, acpi, Thermal Monitor and Software Controlled Clock Facilities - 1, 0, EDX, 23, mmx, Intel MMX Technology - 1, 0, EDX, 24, fxsr, XSAVE and FXRSTOR Instrs - 1, 0, EDX, 25, sse, SSE - 1, 0, EDX, 26, sse2, SSE2 - 1, 0, EDX, 27, ss, Self Snoop - 1, 0, EDX, 28, hit, Max APIC IDs - 1, 0, EDX, 29, tm, Thermal Monitor -# 1, 0, EDX, 30, - 1, 0, EDX, 31, pbe, Pending Break Enable +# Leaf 2H +# Intel cache and TLB information one-byte descriptors -# Leaf 02H -# cache and TLB descriptor info + 2, 0, eax, 7:0, iteration_count , Number of times this CPUD leaf must be queried + 2, 0, eax, 15:8, desc1 , Descriptor #1 + 2, 0, eax, 23:16, desc2 , Descriptor #2 + 2, 0, eax, 30:24, desc3 , Descriptor #3 + 2, 0, eax, 31, eax_invalid , Descriptors 1-3 are invalid if set + 2, 0, ebx, 7:0, desc4 , Descriptor #4 + 2, 0, ebx, 15:8, desc5 , Descriptor #5 + 2, 0, ebx, 23:16, desc6 , Descriptor #6 + 2, 0, ebx, 30:24, desc7 , Descriptor #7 + 2, 0, ebx, 31, ebx_invalid , Descriptors 4-7 are invalid if set + 2, 0, ecx, 7:0, desc8 , Descriptor #8 + 2, 0, ecx, 15:8, desc9 , Descriptor #9 + 2, 0, ecx, 23:16, desc10 , Descriptor #10 + 2, 0, ecx, 30:24, desc11 , Descriptor #11 + 2, 0, ecx, 31, ecx_invalid , Descriptors 8-11 are invalid if set + 2, 0, edx, 7:0, desc12 , Descriptor #12 + 2, 0, edx, 15:8, desc13 , Descriptor #13 + 2, 0, edx, 23:16, desc14 , Descriptor #14 + 2, 0, edx, 30:24, desc15 , Descriptor #15 + 2, 0, edx, 31, edx_invalid , Descriptors 12-15 are invalid if set -# Leaf 03H -# Precessor Serial Number, introduced on Pentium III, not valid for -# latest models +# Leaf 4H +# Intel deterministic cache parameters -# Leaf 04H -# thread/core and cache topology - 4, 0, EAX, 4:0, cache_type, Cache type like instr/data or unified - 4, 0, EAX, 7:5, cache_level, Cache Level (starts at 1) - 4, 0, EAX, 8, cache_self_init, Cache Self Initialization - 4, 0, EAX, 9, fully_associate, Fully Associative cache -# 4, 0, EAX, 13:10, resvd, resvd - 4, 0, EAX, 25:14, max_logical_id, Max number of addressable IDs for logical processors sharing the cache - 4, 0, EAX, 31:26, max_phy_id, Max number of addressable IDs for processors in phy package + 4, 31:0, eax, 4:0, cache_type , Cache type field + 4, 31:0, eax, 7:5, cache_level , Cache level (1-based) + 4, 31:0, eax, 8, cache_self_init , Self-initialializing cache level + 4, 31:0, eax, 9, fully_associative , Fully-associative cache + 4, 31:0, eax, 25:14, num_threads_sharing , Number logical CPUs sharing this cache + 4, 31:0, eax, 31:26, num_cores_on_die , Number of cores in the physical package + 4, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) + 4, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) + 4, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) + 4, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) + 4, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches + 4, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + 4, 31:0, edx, 2, complex_indexing , Not a direct-mapped cache (complex function) - 4, 0, EBX, 11:0, cache_linesize, Size of a cache line in bytes - 4, 0, EBX, 21:12, cache_partition, Physical Line partitions - 4, 0, EBX, 31:22, cache_ways, Ways of associativity - 4, 0, ECX, 31:0, cache_sets, Number of Sets - 1 - 4, 0, EDX, 0, c_wbinvd, 1 means WBINVD/INVD is not ganranteed to act upon lower level caches of non-originating threads sharing this cache - 4, 0, EDX, 1, c_incl, Whether cache is inclusive of lower cache level - 4, 0, EDX, 2, c_comp_index, Complex Cache Indexing +# Leaf 5H +# MONITOR/MWAIT instructions enumeration -# Leaf 05H -# MONITOR/MWAIT - 5, 0, EAX, 15:0, min_mon_size, Smallest monitor line size in bytes - 5, 0, EBX, 15:0, max_mon_size, Largest monitor line size in bytes - 5, 0, ECX, 0, mwait_ext, Enum of Monitor-Mwait extensions supported - 5, 0, ECX, 1, mwait_irq_break, Largest monitor line size in bytes - 5, 0, EDX, 3:0, c0_sub_stats, Number of C0* sub C-states supported using MWAIT - 5, 0, EDX, 7:4, c1_sub_stats, Number of C1* sub C-states supported using MWAIT - 5, 0, EDX, 11:8, c2_sub_stats, Number of C2* sub C-states supported using MWAIT - 5, 0, EDX, 15:12, c3_sub_stats, Number of C3* sub C-states supported using MWAIT - 5, 0, EDX, 19:16, c4_sub_stats, Number of C4* sub C-states supported using MWAIT - 5, 0, EDX, 23:20, c5_sub_stats, Number of C5* sub C-states supported using MWAIT - 5, 0, EDX, 27:24, c6_sub_stats, Number of C6* sub C-states supported using MWAIT - 5, 0, EDX, 31:28, c7_sub_stats, Number of C7* sub C-states supported using MWAIT + 5, 0, eax, 15:0, min_mon_size , Smallest monitor-line size, in bytes + 5, 0, ebx, 15:0, max_mon_size , Largest monitor-line size, in bytes + 5, 0, ecx, 0, mwait_ext , Enumeration of MONITOR/MWAIT extensions is supported + 5, 0, ecx, 1, mwait_irq_break , Interrupts as a break-event for MWAIT is supported + 5, 0, edx, 3:0, n_c0_substates , Number of C0 sub C-states supported using MWAIT + 5, 0, edx, 7:4, n_c1_substates , Number of C1 sub C-states supported using MWAIT + 5, 0, edx, 11:8, n_c2_substates , Number of C2 sub C-states supported using MWAIT + 5, 0, edx, 15:12, n_c3_substates , Number of C3 sub C-states supported using MWAIT + 5, 0, edx, 19:16, n_c4_substates , Number of C4 sub C-states supported using MWAIT + 5, 0, edx, 23:20, n_c5_substates , Number of C5 sub C-states supported using MWAIT + 5, 0, edx, 27:24, n_c6_substates , Number of C6 sub C-states supported using MWAIT + 5, 0, edx, 31:28, n_c7_substates , Number of C7 sub C-states supported using MWAIT -# Leaf 06H -# Thermal & Power Management +# Leaf 6H +# Thermal and Power Management enumeration - 6, 0, EAX, 0, dig_temp, Digital temperature sensor supported - 6, 0, EAX, 1, turbo, Intel Turbo Boost - 6, 0, EAX, 2, arat, Always running APIC timer -# 6, 0, EAX, 3, resv, Reserved - 6, 0, EAX, 4, pln, Power limit notifications supported - 6, 0, EAX, 5, ecmd, Clock modulation duty cycle extension supported - 6, 0, EAX, 6, ptm, Package thermal management supported - 6, 0, EAX, 7, hwp, HWP base register - 6, 0, EAX, 8, hwp_notify, HWP notification - 6, 0, EAX, 9, hwp_act_window, HWP activity window - 6, 0, EAX, 10, hwp_energy, HWP energy performance preference - 6, 0, EAX, 11, hwp_pkg_req, HWP package level request -# 6, 0, EAX, 12, resv, Reserved - 6, 0, EAX, 13, hdc, HDC base registers supported - 6, 0, EAX, 14, turbo3, Turbo Boost Max 3.0 - 6, 0, EAX, 15, hwp_cap, Highest Performance change supported - 6, 0, EAX, 16, hwp_peci, HWP PECI override is supported - 6, 0, EAX, 17, hwp_flex, Flexible HWP is supported - 6, 0, EAX, 18, hwp_fast, Fast access mode for the IA32_HWP_REQUEST MSR is supported -# 6, 0, EAX, 19, resv, Reserved - 6, 0, EAX, 20, hwp_ignr, Ignoring Idle Logical Processor HWP request is supported + 6, 0, eax, 0, dtherm , Digital temprature sensor + 6, 0, eax, 1, turbo_boost , Intel Turbo Boost + 6, 0, eax, 2, arat , Always-Running APIC Timer (not affected by p-state) + 6, 0, eax, 4, pln , Power Limit Notification (PLN) event + 6, 0, eax, 5, ecmd , Clock modulation duty cycle extension + 6, 0, eax, 6, pts , Package thermal management + 6, 0, eax, 7, hwp , HWP (Hardware P-states) base registers are supported + 6, 0, eax, 8, hwp_notify , HWP notification (IA32_HWP_INTERRUPT MSR) + 6, 0, eax, 9, hwp_act_window , HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported + 6, 0, eax, 10, hwp_epp , HWP Energy Performance Preference + 6, 0, eax, 11, hwp_pkg_req , HWP Package Level Request + 6, 0, eax, 13, hdc_base_regs , HDC base registers are supported + 6, 0, eax, 14, turbo_boost_3_0 , Intel Turbo Boost Max 3.0 + 6, 0, eax, 15, hwp_capabilities , HWP Highest Performance change + 6, 0, eax, 16, hwp_peci_override , HWP PECI override + 6, 0, eax, 17, hwp_flexible , Flexible HWP + 6, 0, eax, 18, hwp_fast , IA32_HWP_REQUEST MSR fast access mode + 6, 0, eax, 19, hfi , HW_FEEDBACK MSRs supported + 6, 0, eax, 20, hwp_ignore_idle , Ignoring idle logical CPU HWP req is supported + 6, 0, eax, 23, thread_director , Intel thread director support + 6, 0, eax, 24, therm_interrupt_bit25 , IA32_THERM_INTERRUPT MSR bit 25 is supported + 6, 0, ebx, 3:0, n_therm_thresholds , Digital thermometer thresholds + 6, 0, ecx, 0, aperfmperf , MPERF/APERF MSRs (effective frequency interface) + 6, 0, ecx, 3, epb , IA32_ENERGY_PERF_BIAS MSR support + 6, 0, ecx, 15:8, thrd_director_nclasses , Number of classes, Intel thread director + 6, 0, edx, 0, perfcap_reporting , Performance capability reporting + 6, 0, edx, 1, encap_reporting , Energy efficiency capability reporting + 6, 0, edx, 11:8, feedback_sz , HW feedback interface struct size, in 4K pages + 6, 0, edx, 31:16, this_lcpu_hwfdbk_idx , This logical CPU index @ HW feedback struct, 0-based - 6, 0, EBX, 3:0, therm_irq_thresh, Number of Interrupt Thresholds in Digital Thermal Sensor - 6, 0, ECX, 0, aperfmperf, Presence of IA32_MPERF and IA32_APERF - 6, 0, ECX, 3, energ_bias, Performance-energy bias preference supported +# Leaf 7H +# Extended CPU features enumeration -# Leaf 07H -# ECX == 0 -# AVX512 refers to https://en.wikipedia.org/wiki/AVX-512 -# XXX: Do we really need to enumerate each and every AVX512 sub features + 7, 0, eax, 31:0, leaf7_n_subleaves , Number of cpuid 0x7 subleaves + 7, 0, ebx, 0, fsgsbase , FSBASE/GSBASE read/write support + 7, 0, ebx, 1, tsc_adjust , IA32_TSC_ADJUST MSR supported + 7, 0, ebx, 2, sgx , Intel SGX (Software Guard Extensions) + 7, 0, ebx, 3, bmi1 , Bit manipulation extensions group 1 + 7, 0, ebx, 4, hle , Hardware Lock Elision + 7, 0, ebx, 5, avx2 , AVX2 instruction set + 7, 0, ebx, 6, fdp_excptn_only , FPU Data Pointer updated only on x87 exceptions + 7, 0, ebx, 7, smep , Supervisor Mode Execution Protection + 7, 0, ebx, 8, bmi2 , Bit manipulation extensions group 2 + 7, 0, ebx, 9, erms , Enhanced REP MOVSB/STOSB + 7, 0, ebx, 10, invpcid , INVPCID instruction (Invalidate Processor Context ID) + 7, 0, ebx, 11, rtm , Intel restricted transactional memory + 7, 0, ebx, 12, cqm , Intel RDT-CMT / AMD Platform-QoS cache monitoring + 7, 0, ebx, 13, zero_fcs_fds , Deprecated FPU CS/DS (stored as zero) + 7, 0, ebx, 14, mpx , Intel memory protection extensions + 7, 0, ebx, 15, rdt_a , Intel RDT / AMD Platform-QoS Enforcemeent + 7, 0, ebx, 16, avx512f , AVX-512 foundation instructions + 7, 0, ebx, 17, avx512dq , AVX-512 double/quadword instructions + 7, 0, ebx, 18, rdseed , RDSEED instruction + 7, 0, ebx, 19, adx , ADCX/ADOX instructions + 7, 0, ebx, 20, smap , Supervisor mode access prevention + 7, 0, ebx, 21, avx512ifma , AVX-512 integer fused multiply add + 7, 0, ebx, 23, clflushopt , CLFLUSHOPT instruction + 7, 0, ebx, 24, clwb , CLWB instruction + 7, 0, ebx, 25, intel_pt , Intel processor trace + 7, 0, ebx, 26, avx512pf , AVX-512 prefetch instructions + 7, 0, ebx, 27, avx512er , AVX-512 exponent/reciprocal instrs + 7, 0, ebx, 28, avx512cd , AVX-512 conflict detection instrs + 7, 0, ebx, 29, sha_ni , SHA/SHA256 instructions + 7, 0, ebx, 30, avx512bw , AVX-512 BW (byte/word granular) instructions + 7, 0, ebx, 31, avx512vl , AVX-512 VL (128/256 vector length) extensions + 7, 0, ecx, 0, prefetchwt1 , PREFETCHWT1 (Intel Xeon Phi only) + 7, 0, ecx, 1, avx512vbmi , AVX-512 Vector byte manipulation instrs + 7, 0, ecx, 2, umip , User mode instruction protection + 7, 0, ecx, 3, pku , Protection keys for user-space + 7, 0, ecx, 4, ospke , OS protection keys enable + 7, 0, ecx, 5, waitpkg , WAITPKG instructions + 7, 0, ecx, 6, avx512_vbmi2 , AVX-512 vector byte manipulation instrs group 2 + 7, 0, ecx, 7, cet_ss , CET shadow stack features + 7, 0, ecx, 8, gfni , Galois field new instructions + 7, 0, ecx, 9, vaes , Vector AES instrs + 7, 0, ecx, 10, vpclmulqdq , VPCLMULQDQ 256-bit instruction support + 7, 0, ecx, 11, avx512_vnni , Vector neural network instructions + 7, 0, ecx, 12, avx512_bitalg , AVX-512 bit count/shiffle + 7, 0, ecx, 13, tme , Intel total memory encryption + 7, 0, ecx, 14, avx512_vpopcntdq , AVX-512: POPCNT for vectors of DW/QW + 7, 0, ecx, 16, la57 , 57-bit linear addreses (five-level paging) + 7, 0, ecx, 21:17, mawau_val_lm , BNDLDX/BNDSTX MAWAU value in 64-bit mode + 7, 0, ecx, 22, rdpid , RDPID instruction + 7, 0, ecx, 23, key_locker , Intel key locker support + 7, 0, ecx, 24, bus_lock_detect , OS bus-lock detection + 7, 0, ecx, 25, cldemote , CLDEMOTE instruction + 7, 0, ecx, 27, movdiri , MOVDIRI instruction + 7, 0, ecx, 28, movdir64b , MOVDIR64B instruction + 7, 0, ecx, 29, enqcmd , Enqueue stores supported (ENQCMD{,S}) + 7, 0, ecx, 30, sgx_lc , Intel SGX launch configuration + 7, 0, ecx, 31, pks , Protection keys for supervisor-mode pages + 7, 0, edx, 1, sgx_keys , Intel SGX attestation services + 7, 0, edx, 2, avx512_4vnniw , AVX-512 neural network instructions + 7, 0, edx, 3, avx512_4fmaps , AVX-512 multiply accumulation single precision + 7, 0, edx, 4, fsrm , Fast short REP MOV + 7, 0, edx, 5, uintr , CPU supports user interrupts + 7, 0, edx, 8, avx512_vp2intersect , VP2INTERSECT{D,Q} instructions + 7, 0, edx, 9, srdbs_ctrl , SRBDS mitigation MSR available + 7, 0, edx, 10, md_clear , VERW MD_CLEAR microcode support + 7, 0, edx, 11, rtm_always_abort , XBEGIN (RTM transaction) always aborts + 7, 0, edx, 13, tsx_force_abort , MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported + 7, 0, edx, 14, serialize , SERIALIZE instruction + 7, 0, edx, 15, hybrid_cpu , The CPU is identified as a 'hybrid part' + 7, 0, edx, 16, tsxldtrk , TSX suspend/resume load address tracking + 7, 0, edx, 18, pconfig , PCONFIG instruction + 7, 0, edx, 19, arch_lbr , Intel architectural LBRs + 7, 0, edx, 20, ibt , CET indirect branch tracking + 7, 0, edx, 22, amx_bf16 , AMX-BF16: tile bfloat16 support + 7, 0, edx, 23, avx512_fp16 , AVX-512 FP16 instructions + 7, 0, edx, 24, amx_tile , AMX-TILE: tile architecture support + 7, 0, edx, 25, amx_int8 , AMX-INT8: tile 8-bit integer support + 7, 0, edx, 26, spec_ctrl , Speculation Control (IBRS/IBPB: indirect branch restrictions) + 7, 0, edx, 27, intel_stibp , Single thread indirect branch predictors + 7, 0, edx, 28, flush_l1d , FLUSH L1D cache: IA32_FLUSH_CMD MSR + 7, 0, edx, 29, arch_capabilities , Intel IA32_ARCH_CAPABILITIES MSR + 7, 0, edx, 30, core_capabilities , IA32_CORE_CAPABILITIES MSR + 7, 0, edx, 31, spec_ctrl_ssbd , Speculative store bypass disable + 7, 1, eax, 4, avx_vnni , AVX-VNNI instructions + 7, 1, eax, 5, avx512_bf16 , AVX-512 bFloat16 instructions + 7, 1, eax, 6, lass , Linear address space separation + 7, 1, eax, 7, cmpccxadd , CMPccXADD instructions + 7, 1, eax, 8, arch_perfmon_ext , ArchPerfmonExt: CPUID leaf 0x23 is supported + 7, 1, eax, 10, fzrm , Fast zero-length REP MOVSB + 7, 1, eax, 11, fsrs , Fast short REP STOSB + 7, 1, eax, 12, fsrc , Fast Short REP CMPSB/SCASB + 7, 1, eax, 17, fred , FRED: Flexible return and event delivery transitions + 7, 1, eax, 18, lkgs , LKGS: Load 'kernel' (userspace) GS + 7, 1, eax, 19, wrmsrns , WRMSRNS instr (WRMSR-non-serializing) + 7, 1, eax, 21, amx_fp16 , AMX-FP16: FP16 tile operations + 7, 1, eax, 22, hreset , History reset support + 7, 1, eax, 23, avx_ifma , Integer fused multiply add + 7, 1, eax, 26, lam , Linear address masking + 7, 1, eax, 27, rd_wr_msrlist , RDMSRLIST/WRMSRLIST instructions + 7, 1, ebx, 0, intel_ppin , Protected processor inventory number (PPIN{,_CTL} MSRs) + 7, 1, edx, 4, avx_vnni_int8 , AVX-VNNI-INT8 instructions + 7, 1, edx, 5, avx_ne_convert , AVX-NE-CONVERT instructions + 7, 1, edx, 8, amx_complex , AMX-COMPLEX instructions (starting from Granite Rapids) + 7, 1, edx, 14, prefetchit_0_1 , PREFETCHIT0/1 instructions + 7, 1, edx, 18, cet_sss , CET supervisor shadow stacks safe to use + 7, 2, edx, 0, intel_psfd , Intel predictive store forward disable + 7, 2, edx, 1, ipred_ctrl , MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S} + 7, 2, edx, 2, rrsba_ctrl , MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S} + 7, 2, edx, 3, ddp_ctrl , MSR bit IA32_SPEC_CTRL.DDPD_U + 7, 2, edx, 4, bhi_ctrl , MSR bit IA32_SPEC_CTRL.BHI_DIS_S + 7, 2, edx, 5, mcdt_no , MCDT mitigation not needed + 7, 2, edx, 6, uclock_disable , UC-lock disable is supported - 7, 0, EBX, 0, fsgsbase, RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE supported - 7, 0, EBX, 1, tsc_adjust, TSC_ADJUST MSR supported - 7, 0, EBX, 2, sgx, Software Guard Extensions - 7, 0, EBX, 3, bmi1, BMI1 - 7, 0, EBX, 4, hle, Hardware Lock Elision - 7, 0, EBX, 5, avx2, AVX2 -# 7, 0, EBX, 6, fdp_excp_only, x87 FPU Data Pointer updated only on x87 exceptions - 7, 0, EBX, 7, smep, Supervisor-Mode Execution Prevention - 7, 0, EBX, 8, bmi2, BMI2 - 7, 0, EBX, 9, rep_movsb, Enhanced REP MOVSB/STOSB - 7, 0, EBX, 10, invpcid, INVPCID instruction - 7, 0, EBX, 11, rtm, Restricted Transactional Memory - 7, 0, EBX, 12, rdt_m, Intel RDT Monitoring capability - 7, 0, EBX, 13, depc_fpu_cs_ds, Deprecates FPU CS and FPU DS - 7, 0, EBX, 14, mpx, Memory Protection Extensions - 7, 0, EBX, 15, rdt_a, Intel RDT Allocation capability - 7, 0, EBX, 16, avx512f, AVX512 Foundation instr - 7, 0, EBX, 17, avx512dq, AVX512 Double and Quadword AVX512 instr - 7, 0, EBX, 18, rdseed, RDSEED instr - 7, 0, EBX, 19, adx, ADX instr - 7, 0, EBX, 20, smap, Supervisor Mode Access Prevention - 7, 0, EBX, 21, avx512ifma, AVX512 Integer Fused Multiply Add -# 7, 0, EBX, 22, resvd, resvd - 7, 0, EBX, 23, clflushopt, CLFLUSHOPT instr - 7, 0, EBX, 24, clwb, CLWB instr - 7, 0, EBX, 25, intel_pt, Intel Processor Trace instr - 7, 0, EBX, 26, avx512pf, Prefetch - 7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr - 7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr - 7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr - 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr - 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL) - 7, 0, ECX, 0, prefetchwt1, X - 7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions - 7, 0, ECX, 2, umip, User-mode Instruction Prevention +# Leaf 9H +# Intel DCA (Direct Cache Access) enumeration - 7, 0, ECX, 3, pku, Protection Keys for User-mode pages - 7, 0, ECX, 4, ospke, CR4 PKE set to enable protection keys -# 7, 0, ECX, 16:5, resvd, resvd - 7, 0, ECX, 21:17, mawau, The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode - 7, 0, ECX, 22, rdpid, RDPID and IA32_TSC_AUX -# 7, 0, ECX, 29:23, resvd, resvd - 7, 0, ECX, 30, sgx_lc, SGX Launch Configuration -# 7, 0, ECX, 31, resvd, resvd + 9, 0, eax, 0, dca_enabled_in_bios , DCA is enabled in BIOS -# Leaf 08H -# +# Leaf AH +# Intel PMU (Performance Monitoring Unit) enumeration + 0xa, 0, eax, 7:0, pmu_version , Performance monitoring unit version ID + 0xa, 0, eax, 15:8, pmu_n_gcounters , Number of general PMU counters per logical CPU + 0xa, 0, eax, 23:16, pmu_gcounters_nbits , Bitwidth of PMU general counters + 0xa, 0, eax, 31:24, pmu_cpuid_ebx_bits , Length of cpuid leaf 0xa EBX bit vector + 0xa, 0, ebx, 0, no_core_cycle_evt , Core cycle event not available + 0xa, 0, ebx, 1, no_insn_retired_evt , Instruction retired event not available + 0xa, 0, ebx, 2, no_refcycle_evt , Reference cycles event not available + 0xa, 0, ebx, 3, no_llc_ref_evt , LLC-reference event not available + 0xa, 0, ebx, 4, no_llc_miss_evt , LLC-misses event not available + 0xa, 0, ebx, 5, no_br_insn_ret_evt , Branch instruction retired event not available + 0xa, 0, ebx, 6, no_br_mispredict_evt , Branch mispredict retired event not available + 0xa, 0, ebx, 7, no_td_slots_evt , Topdown slots event not available + 0xa, 0, ecx, 31:0, pmu_fcounters_bitmap , Fixed-function PMU counters support bitmap + 0xa, 0, edx, 4:0, pmu_n_fcounters , Number of fixed PMU counters + 0xa, 0, edx, 12:5, pmu_fcounters_nbits , Bitwidth of PMU fixed counters + 0xa, 0, edx, 15, anythread_depr , AnyThread deprecation -# Leaf 09H -# Direct Cache Access (DCA) information - 9, 0, ECX, 31:0, dca_cap, The value of IA32_PLATFORM_DCA_CAP +# Leaf BH +# CPUs v1 extended topology enumeration -# Leaf 0AH -# Architectural Performance Monitoring -# -# Do we really need to print out the PMU related stuff? -# Does normal user really care about it? -# - 0xA, 0, EAX, 7:0, pmu_ver, Performance Monitoring Unit version - 0xA, 0, EAX, 15:8, pmu_gp_cnt_num, Numer of general-purose PMU counters per logical CPU - 0xA, 0, EAX, 23:16, pmu_cnt_bits, Bit wideth of PMU counter - 0xA, 0, EAX, 31:24, pmu_ebx_bits, Length of EBX bit vector to enumerate PMU events + 0xb, 1:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0xb, 1:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0xb, 1:0, ecx, 7:0, domain_nr , This domain level (subleaf ID) + 0xb, 1:0, ecx, 15:8, domain_type , This domain type + 0xb, 1:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU - 0xA, 0, EBX, 0, pmu_no_core_cycle_evt, Core cycle event not available - 0xA, 0, EBX, 1, pmu_no_instr_ret_evt, Instruction retired event not available - 0xA, 0, EBX, 2, pmu_no_ref_cycle_evt, Reference cycles event not available - 0xA, 0, EBX, 3, pmu_no_llc_ref_evt, Last-level cache reference event not available - 0xA, 0, EBX, 4, pmu_no_llc_mis_evt, Last-level cache misses event not available - 0xA, 0, EBX, 5, pmu_no_br_instr_ret_evt, Branch instruction retired event not available - 0xA, 0, EBX, 6, pmu_no_br_mispredict_evt, Branch mispredict retired event not available +# Leaf DH +# Processor extended state enumeration - 0xA, 0, ECX, 4:0, pmu_fixed_cnt_num, Performance Monitoring Unit version - 0xA, 0, ECX, 12:5, pmu_fixed_cnt_bits, Numer of PMU counters per logical CPU + 0xd, 0, eax, 0, xcr0_x87 , XCR0.X87 (bit 0) supported + 0xd, 0, eax, 1, xcr0_sse , XCR0.SEE (bit 1) supported + 0xd, 0, eax, 2, xcr0_avx , XCR0.AVX (bit 2) supported + 0xd, 0, eax, 3, xcr0_mpx_bndregs , XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0xd, 0, eax, 4, xcr0_mpx_bndcsr , XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0xd, 0, eax, 5, xcr0_avx512_opmask , XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0xd, 0, eax, 6, xcr0_avx512_zmm_hi256 , XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0xd, 0, eax, 7, xcr0_avx512_hi16_zmm , XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0xd, 0, eax, 9, xcr0_pkru , XCR0.PKRU (bit 9) supported (XSAVE PKRU reg) + 0xd, 0, eax, 11, xcr0_cet_u , AMD XCR0.CET_U (bit 11) supported (CET supervisor state) + 0xd, 0, eax, 12, xcr0_cet_s , AMD XCR0.CET_S (bit 12) support (CET user state) + 0xd, 0, eax, 17, xcr0_tileconfig , XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0xd, 0, eax, 18, xcr0_tiledata , XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0xd, 0, ebx, 31:0, xsave_sz_xcr0_enabled , XSAVE/XRSTR area byte size, for XCR0 enabled features + 0xd, 0, ecx, 31:0, xsave_sz_max , XSAVE/XRSTR area max byte size, all CPU features + 0xd, 0, edx, 30, xcr0_lwp , AMD XCR0.LWP (bit 62) supported (Light-weight Profiling) + 0xd, 1, eax, 0, xsaveopt , XSAVEOPT instruction + 0xd, 1, eax, 1, xsavec , XSAVEC instruction + 0xd, 1, eax, 2, xgetbv1 , XGETBV instruction with ECX = 1 + 0xd, 1, eax, 3, xsaves , XSAVES/XRSTORS instructions (and XSS MSR) + 0xd, 1, eax, 4, xfd , Extended feature disable support + 0xd, 1, ebx, 31:0, xsave_sz_xcr0_xmms_enabled, XSAVE area size, all XCR0 and XMMS features enabled + 0xd, 1, ecx, 8, xss_pt , PT state, supported + 0xd, 1, ecx, 10, xss_pasid , PASID state, supported + 0xd, 1, ecx, 11, xss_cet_u , CET user state, supported + 0xd, 1, ecx, 12, xss_cet_p , CET supervisor state, supported + 0xd, 1, ecx, 13, xss_hdc , HDC state, supported + 0xd, 1, ecx, 14, xss_uintr , UINTR state, supported + 0xd, 1, ecx, 15, xss_lbr , LBR state, supported + 0xd, 1, ecx, 16, xss_hwp , HWP state, supported + 0xd, 63:2, eax, 31:0, xsave_sz , Size of save area for subleaf-N feature, in bytes + 0xd, 63:2, ebx, 31:0, xsave_offset , Offset of save area for subleaf-N feature, in bytes + 0xd, 63:2, ecx, 0, is_xss_bit , Subleaf N describes an XSS bit, otherwise XCR0 bit + 0xd, 63:2, ecx, 1, compacted_xsave_64byte_aligned, When compacted, subleaf-N feature xsave area is 64-byte aligned -# Leaf 0BH -# Extended Topology Enumeration Leaf -# +# Leaf FH +# Intel RDT / AMD PQoS resource monitoring - 0xB, 0, EAX, 4:0, id_shift, Number of bits to shift right on x2APIC ID to get a unique topology ID of the next level type - 0xB, 0, EBX, 15:0, cpu_nr, Number of logical processors at this level type - 0xB, 0, ECX, 15:8, lvl_type, 0-Invalid 1-SMT 2-Core - 0xB, 0, EDX, 31:0, x2apic_id, x2APIC ID the current logical processor - - -# Leaf 0DH -# Processor Extended State - - 0xD, 0, EAX, 0, x87, X87 state - 0xD, 0, EAX, 1, sse, SSE state - 0xD, 0, EAX, 2, avx, AVX state - 0xD, 0, EAX, 4:3, mpx, MPX state - 0xD, 0, EAX, 7:5, avx512, AVX-512 state - 0xD, 0, EAX, 9, pkru, PKRU state - - 0xD, 0, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 0, ECX, 31:0, max_sz_xsave, Maximum size (bytes) of the XSAVE/XRSTOR save area - - 0xD, 1, EAX, 0, xsaveopt, XSAVEOPT available - 0xD, 1, EAX, 1, xsavec, XSAVEC and compacted form supported - 0xD, 1, EAX, 2, xgetbv, XGETBV supported - 0xD, 1, EAX, 3, xsaves, XSAVES/XRSTORS and IA32_XSS supported - - 0xD, 1, EBX, 31:0, max_sz_xcr0, Maximum size (bytes) required by enabled features in XCR0 - 0xD, 1, ECX, 8, pt, PT state - 0xD, 1, ECX, 11, cet_usr, CET user state - 0xD, 1, ECX, 12, cet_supv, CET supervisor state - 0xD, 1, ECX, 13, hdc, HDC state - 0xD, 1, ECX, 16, hwp, HWP state - -# Leaf 0FH -# Intel RDT Monitoring - - 0xF, 0, EBX, 31:0, rmid_range, Maximum range (zero-based) of RMID within this physical processor of all types - 0xF, 0, EDX, 1, l3c_rdt_mon, L3 Cache RDT Monitoring supported - - 0xF, 1, ECX, 31:0, rmid_range, Maximum range (zero-based) of RMID of this types - 0xF, 1, EDX, 0, l3c_ocp_mon, L3 Cache occupancy Monitoring supported - 0xF, 1, EDX, 1, l3c_tbw_mon, L3 Cache Total Bandwidth Monitoring supported - 0xF, 1, EDX, 2, l3c_lbw_mon, L3 Cache Local Bandwidth Monitoring supported + 0xf, 0, ebx, 31:0, core_rmid_max , RMID max, within this core, all types (0-based) + 0xf, 0, edx, 1, cqm_llc , LLC QoS-monitoring supported + 0xf, 1, eax, 7:0, l3c_qm_bitwidth , L3 QoS-monitoring counter bitwidth (24-based) + 0xf, 1, eax, 8, l3c_qm_overflow_bit , QM_CTR MSR bit 61 is an overflow bit + 0xf, 1, ebx, 31:0, l3c_qm_conver_factor , QM_CTR MSR conversion factor to bytes + 0xf, 1, ecx, 31:0, l3c_qm_rmid_max , L3 QoS-monitoring max RMID + 0xf, 1, edx, 0, cqm_occup_llc , L3 QoS occupancy monitoring supported + 0xf, 1, edx, 1, cqm_mbm_total , L3 QoS total bandwidth monitoring supported + 0xf, 1, edx, 2, cqm_mbm_local , L3 QoS local bandwidth monitoring supported # Leaf 10H -# Intel RDT Allocation - - 0x10, 0, EBX, 1, l3c_rdt_alloc, L3 Cache Allocation supported - 0x10, 0, EBX, 2, l2c_rdt_alloc, L2 Cache Allocation supported - 0x10, 0, EBX, 3, mem_bw_alloc, Memory Bandwidth Allocation supported +# Intel RDT / AMD PQoS allocation enumeration + 0x10, 0, ebx, 1, cat_l3 , L3 Cache Allocation Technology supported + 0x10, 0, ebx, 2, cat_l2 , L2 Cache Allocation Technology supported + 0x10, 0, ebx, 3, mba , Memory Bandwidth Allocation supported + 0x10, 2:1, eax, 4:0, cat_cbm_len , L3/L2_CAT capacity bitmask length, minus-one notation + 0x10, 2:1, ebx, 31:0, cat_units_bitmap , L3/L2_CAT bitmap of allocation units + 0x10, 2:1, ecx, 1, l3_cat_cos_infreq_updates, L3_CAT COS updates should be infrequent + 0x10, 2:1, ecx, 2, cdp_l3 , L3/L2_CAT CDP (Code and Data Prioritization) + 0x10, 2:1, ecx, 3, cat_sparse_1s , L3/L2_CAT non-contiguous 1s value supported + 0x10, 2:1, edx, 15:0, cat_cos_max , L3/L2_CAT max COS (Class of Service) supported + 0x10, 3, eax, 11:0, mba_max_delay , Max MBA throttling value; minus-one notation + 0x10, 3, ecx, 0, per_thread_mba , Per-thread MBA controls are supported + 0x10, 3, ecx, 2, mba_delay_linear , Delay values are linear + 0x10, 3, edx, 15:0, mba_cos_max , MBA max Class of Service supported # Leaf 12H -# SGX Capability -# -# Some detailed SGX features not added yet - - 0x12, 0, EAX, 0, sgx1, L3 Cache Allocation supported - 0x12, 1, EAX, 0, sgx2, L3 Cache Allocation supported +# Intel Software Guard Extensions (SGX) enumeration + 0x12, 0, eax, 0, sgx1 , SGX1 leaf functions supported + 0x12, 0, eax, 1, sgx2 , SGX2 leaf functions supported + 0x12, 0, eax, 5, enclv_leaves , ENCLV leaves (E{INC,DEC}VIRTCHILD, ESETCONTEXT) supported + 0x12, 0, eax, 6, encls_leaves , ENCLS leaves (ENCLS ETRACKC, ERDINFO, ELDBC, ELDUC) supported + 0x12, 0, eax, 7, enclu_everifyreport2 , ENCLU leaf EVERIFYREPORT2 supported + 0x12, 0, eax, 10, encls_eupdatesvn , ENCLS leaf EUPDATESVN supported + 0x12, 0, eax, 11, sgx_edeccssa , ENCLU leaf EDECCSSA supported + 0x12, 0, ebx, 0, miscselect_exinfo , SSA.MISC frame: reporting #PF and #GP exceptions inside enclave supported + 0x12, 0, ebx, 1, miscselect_cpinfo , SSA.MISC frame: reporting #CP exceptions inside enclave supported + 0x12, 0, edx, 7:0, max_enclave_sz_not64 , Maximum enclave size in non-64-bit mode (log2) + 0x12, 0, edx, 15:8, max_enclave_sz_64 , Maximum enclave size in 64-bit mode (log2) + 0x12, 1, eax, 0, secs_attr_init , ATTRIBUTES.INIT supported (enclave initialized by EINIT) + 0x12, 1, eax, 1, secs_attr_debug , ATTRIBUTES.DEBUG supported (enclave permits debugger read/write) + 0x12, 1, eax, 2, secs_attr_mode64bit , ATTRIBUTES.MODE64BIT supported (enclave runs in 64-bit mode) + 0x12, 1, eax, 4, secs_attr_provisionkey , ATTRIBUTES.PROVISIONKEY supported (provisioning key available) + 0x12, 1, eax, 5, secs_attr_einittoken_key, ATTRIBUTES.EINITTOKEN_KEY supported (EINIT token key available) + 0x12, 1, eax, 6, secs_attr_cet , ATTRIBUTES.CET supported (enable CET attributes) + 0x12, 1, eax, 7, secs_attr_kss , ATTRIBUTES.KSS supported (Key Separation and Sharing enabled) + 0x12, 1, eax, 10, secs_attr_aexnotify , ATTRIBUTES.AEXNOTIFY supported (enclave threads may get AEX notifications + 0x12, 1, ecx, 0, xfrm_x87 , Enclave XFRM.X87 (bit 0) supported + 0x12, 1, ecx, 1, xfrm_sse , Enclave XFRM.SEE (bit 1) supported + 0x12, 1, ecx, 2, xfrm_avx , Enclave XFRM.AVX (bit 2) supported + 0x12, 1, ecx, 3, xfrm_mpx_bndregs , Enclave XFRM.BNDREGS (bit 3) supported (MPX BND0-BND3 regs) + 0x12, 1, ecx, 4, xfrm_mpx_bndcsr , Enclave XFRM.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS regs) + 0x12, 1, ecx, 5, xfrm_avx512_opmask , Enclave XFRM.OPMASK (bit 5) supported (AVX-512 k0-k7 regs) + 0x12, 1, ecx, 6, xfrm_avx512_zmm_hi256 , Enclave XFRM.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 regs) + 0x12, 1, ecx, 7, xfrm_avx512_hi16_zmm , Enclave XFRM.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 regs) + 0x12, 1, ecx, 9, xfrm_pkru , Enclave XFRM.PKRU (bit 9) supported (XSAVE PKRU reg) + 0x12, 1, ecx, 17, xfrm_tileconfig , Enclave XFRM.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG) + 0x12, 1, ecx, 18, xfrm_tiledata , Enclave XFRM.TILEDATA (bit 18) supported (AMX can manage TILEDATA) + 0x12, 31:2, eax, 3:0, subleaf_type , Subleaf type (dictates output layout) + 0x12, 31:2, eax, 31:12, epc_sec_base_addr_0 , EPC section base addr, bits[12:31] + 0x12, 31:2, ebx, 19:0, epc_sec_base_addr_1 , EPC section base addr, bits[32:51] + 0x12, 31:2, ecx, 3:0, epc_sec_type , EPC section type / property encoding + 0x12, 31:2, ecx, 31:12, epc_sec_size_0 , EPC section size, bits[12:31] + 0x12, 31:2, edx, 19:0, epc_sec_size_1 , EPC section size, bits[32:51] # Leaf 14H -# Intel Processor Tracer -# +# Intel Processor Trace enumeration + + 0x14, 0, eax, 31:0, pt_max_subleaf , Max cpuid 0x14 subleaf + 0x14, 0, ebx, 0, cr3_filtering , IA32_RTIT_CR3_MATCH is accessible + 0x14, 0, ebx, 1, psb_cyc , Configurable PSB and cycle-accurate mode + 0x14, 0, ebx, 2, ip_filtering , IP/TraceStop filtering; Warm-reset PT MSRs preservation + 0x14, 0, ebx, 3, mtc_timing , MTC timing packet; COFI-based packets suppression + 0x14, 0, ebx, 4, ptwrite , PTWRITE support + 0x14, 0, ebx, 5, power_event_trace , Power Event Trace support + 0x14, 0, ebx, 6, psb_pmi_preserve , PSB and PMI preservation support + 0x14, 0, ebx, 7, event_trace , Event Trace packet generation through IA32_RTIT_CTL.EventEn + 0x14, 0, ebx, 8, tnt_disable , TNT packet generation disable through IA32_RTIT_CTL.DisTNT + 0x14, 0, ecx, 0, topa_output , ToPA output scheme support + 0x14, 0, ecx, 1, topa_multiple_entries , ToPA tables can hold multiple entries + 0x14, 0, ecx, 2, single_range_output , Single-range output scheme supported + 0x14, 0, ecx, 3, trance_transport_output, Trace Transport subsystem output support + 0x14, 0, ecx, 31, ip_payloads_lip , IP payloads have LIP values (CS base included) + 0x14, 1, eax, 2:0, num_address_ranges , Filtering number of configurable Address Ranges + 0x14, 1, eax, 31:16, mtc_periods_bmp , Bitmap of supported MTC period encodings + 0x14, 1, ebx, 15:0, cycle_thresholds_bmp , Bitmap of supported Cycle Threshold encodings + 0x14, 1, ebx, 31:16, psb_periods_bmp , Bitmap of supported Configurable PSB frequency encodings # Leaf 15H -# Time Stamp Counter and Nominal Core Crystal Clock Information +# Intel TSC (Time Stamp Counter) enumeration - 0x15, 0, EAX, 31:0, tsc_denominator, The denominator of the TSC/”core crystal clock” ratio - 0x15, 0, EBX, 31:0, tsc_numerator, The numerator of the TSC/”core crystal clock” ratio - 0x15, 0, ECX, 31:0, nom_freq, Nominal frequency of the core crystal clock in Hz + 0x15, 0, eax, 31:0, tsc_denominator , Denominator of the TSC/'core crystal clock' ratio + 0x15, 0, ebx, 31:0, tsc_numerator , Numerator of the TSC/'core crystal clock' ratio + 0x15, 0, ecx, 31:0, cpu_crystal_hz , Core crystal clock nominal frequency, in Hz # Leaf 16H -# Processor Frequency Information +# Intel processor fequency enumeration - 0x16, 0, EAX, 15:0, cpu_base_freq, Processor Base Frequency in MHz - 0x16, 0, EBX, 15:0, cpu_max_freq, Maximum Frequency in MHz - 0x16, 0, ECX, 15:0, bus_freq, Bus (Reference) Frequency in MHz + 0x16, 0, eax, 15:0, cpu_base_mhz , Processor base frequency, in MHz + 0x16, 0, ebx, 15:0, cpu_max_mhz , Processor max frequency, in MHz + 0x16, 0, ecx, 15:0, bus_mhz , Bus reference frequency, in MHz # Leaf 17H -# System-On-Chip Vendor Attribute +# Intel SoC vendor attributes enumeration - 0x17, 0, EAX, 31:0, max_socid, Maximum input value of supported sub-leaf - 0x17, 0, EBX, 15:0, soc_vid, SOC Vendor ID - 0x17, 0, EBX, 16, std_vid, SOC Vendor ID is assigned via an industry standard scheme - 0x17, 0, ECX, 31:0, soc_pid, SOC Project ID assigned by vendor - 0x17, 0, EDX, 31:0, soc_sid, SOC Stepping ID + 0x17, 0, eax, 31:0, soc_max_subleaf , Max cpuid leaf 0x17 subleaf + 0x17, 0, ebx, 15:0, soc_vendor_id , SoC vendor ID + 0x17, 0, ebx, 16, is_vendor_scheme , Assigned by industry enumaeratoion scheme (not Intel) + 0x17, 0, ecx, 31:0, soc_proj_id , SoC project ID, assigned by vendor + 0x17, 0, edx, 31:0, soc_stepping_id , Soc project stepping ID, assigned by vendor + 0x17, 3:1, eax, 31:0, vendor_brand_a , Vendor Brand ID string, bytes subleaf_nr * (0 -> 3) + 0x17, 3:1, ebx, 31:0, vendor_brand_b , Vendor Brand ID string, bytes subleaf_nr * (4 -> 7) + 0x17, 3:1, ecx, 31:0, vendor_brand_c , Vendor Brand ID string, bytes subleaf_nr * (8 -> 11) + 0x17, 3:1, edx, 31:0, vendor_brand_d , Vendor Brand ID string, bytes subleaf_nr * (12 -> 15) # Leaf 18H -# Deterministic Address Translation Parameters +# Intel determenestic address translation (TLB) parameters + 0x18, 31:0, eax, 31:0, tlb_max_subleaf , Max cpuid 0x18 subleaf + 0x18, 31:0, ebx, 0, tlb_4k_page , TLB 4KB-page entries supported + 0x18, 31:0, ebx, 1, tlb_2m_page , TLB 2MB-page entries supported + 0x18, 31:0, ebx, 2, tlb_4m_page , TLB 4MB-page entries supported + 0x18, 31:0, ebx, 3, tlb_1g_page , TLB 1GB-page entries supported + 0x18, 31:0, ebx, 10:8, hard_partitioning , (Hard/Soft) partitioning between logical CPUs sharing this struct + 0x18, 31:0, ebx, 31:16, n_way_associative , Ways of associativity + 0x18, 31:0, ecx, 31:0, n_sets , Number of sets + 0x18, 31:0, edx, 4:0, tlb_type , Translation cache type (TLB type) + 0x18, 31:0, edx, 7:5, tlb_cache_level , Translation cache level (1-based) + 0x18, 31:0, edx, 8, is_fully_associative , Fully-associative structure + 0x18, 31:0, edx, 25:14, tlb_max_addressible_ids, Max num of addressible IDs for logical CPUs sharing this TLB - 1 # Leaf 19H -# Key Locker Leaf +# Intel Key Locker enumeration + 0x19, 0, eax, 0, kl_cpl0_only , CPL0-only key Locker restriction supported + 0x19, 0, eax, 1, kl_no_encrypt , No-encrypt key locker restriction supported + 0x19, 0, eax, 2, kl_no_decrypt , No-decrypt key locker restriction supported + 0x19, 0, ebx, 0, aes_keylocker , AES key locker instructions supported + 0x19, 0, ebx, 2, aes_keylocker_wide , AES wide key locker instructions supported + 0x19, 0, ebx, 4, kl_msr_iwkey , Key locker MSRs and IWKEY backups supported + 0x19, 0, ecx, 0, loadiwkey_no_backup , LOADIWKEY NoBackup parameter supported + 0x19, 0, ecx, 1, iwkey_rand , IWKEY randomization (KeySource encoding 1) supported # Leaf 1AH -# Hybrid Information +# Intel hybrid CPUs identification (e.g. Atom, Core) - 0x1A, 0, EAX, 31:24, core_type, 20H-Intel_Atom 40H-Intel_Core + 0x1a, 0, eax, 23:0, core_native_model , This core's native model ID + 0x1a, 0, eax, 31:24, core_type , This core's type +# Leaf 1BH +# Intel PCONFIG (Platform configuration) enumeration + + 0x1b, 31:0, eax, 11:0, pconfig_subleaf_type , CPUID 0x1b subleaf type + 0x1b, 31:0, ebx, 31:0, pconfig_target_id_x , A supported PCONFIG target ID + 0x1b, 31:0, ecx, 31:0, pconfig_target_id_y , A supported PCONFIG target ID + 0x1b, 31:0, edx, 31:0, pconfig_target_id_z , A supported PCONFIG target ID + +# Leaf 1CH +# Intel LBR (Last Branch Record) enumeration + + 0x1c, 0, eax, 0, lbr_depth_8 , Max stack depth (number of LBR entries) = 8 + 0x1c, 0, eax, 1, lbr_depth_16 , Max stack depth (number of LBR entries) = 16 + 0x1c, 0, eax, 2, lbr_depth_24 , Max stack depth (number of LBR entries) = 24 + 0x1c, 0, eax, 3, lbr_depth_32 , Max stack depth (number of LBR entries) = 32 + 0x1c, 0, eax, 4, lbr_depth_40 , Max stack depth (number of LBR entries) = 40 + 0x1c, 0, eax, 5, lbr_depth_48 , Max stack depth (number of LBR entries) = 48 + 0x1c, 0, eax, 6, lbr_depth_56 , Max stack depth (number of LBR entries) = 56 + 0x1c, 0, eax, 7, lbr_depth_64 , Max stack depth (number of LBR entries) = 64 + 0x1c, 0, eax, 30, lbr_deep_c_reset , LBRs maybe cleared on MWAIT C-state > C1 + 0x1c, 0, eax, 31, lbr_ip_is_lip , LBR IP contain Last IP, otherwise effective IP + 0x1c, 0, ebx, 0, lbr_cpl , CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported + 0x1c, 0, ebx, 1, lbr_branch_filter , Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported + 0x1c, 0, ebx, 2, lbr_call_stack , Call-stack mode (IA32_LBR_CTL[3] = 1) supported + 0x1c, 0, ecx, 0, lbr_mispredict , Branch misprediction bit supported (IA32_LBR_x_INFO[63]) + 0x1c, 0, ecx, 1, lbr_timed_lbr , Timed LBRs (CPU cycles since last LBR entry) supported + 0x1c, 0, ecx, 2, lbr_branch_type , Branch type field (IA32_LBR_INFO_x[59:56]) supported + 0x1c, 0, ecx, 19:16, lbr_events_gpc_bmp , LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters + +# Leaf 1DH +# Intel AMX (Advanced Matrix Extensions) tile information + + 0x1d, 0, eax, 31:0, amx_max_palette , Highest palette ID / subleaf ID + 0x1d, 1, eax, 15:0, amx_palette_size , AMX palette total tiles size, in bytes + 0x1d, 1, eax, 31:16, amx_tile_size , AMX single tile's size, in bytes + 0x1d, 1, ebx, 15:0, amx_tile_row_size , AMX tile single row's size, in bytes + 0x1d, 1, ebx, 31:16, amx_palette_nr_tiles , AMX palette number of tiles + 0x1d, 1, ecx, 15:0, amx_tile_nr_rows , AMX tile max number of rows + +# Leaf 1EH +# Intel AMX, TMUL (Tile-matrix MULtiply) accelerator unit enumeration + + 0x1e, 0, ebx, 7:0, tmul_maxk , TMUL unit maximum height, K (rows or columns) + 0x1e, 0, ebx, 23:8, tmul_maxn , TMUL unit maxiumum SIMD dimension, N (column bytes) # Leaf 1FH -# V2 Extended Topology - A preferred superset to leaf 0BH +# Intel extended topology enumeration v2 + 0x1f, 5:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) + 0x1f, 5:0, ebx, 15:0, domain_lcpus_count , Logical CPUs count across all instances of this domain + 0x1f, 5:0, ecx, 7:0, domain_level , This domain level (subleaf ID) + 0x1f, 5:0, ecx, 15:8, domain_type , This domain type + 0x1f, 5:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU -# According to SDM -# 40000000H - 4FFFFFFFH is invalid range +# Leaf 20H +# Intel HRESET (History Reset) enumeration + + 0x20, 0, eax, 31:0, hreset_nr_subleaves , CPUID 0x20 max subleaf + 1 + 0x20, 0, ebx, 0, hreset_thread_director , HRESET of Intel thread director is supported + +# Leaf 21H +# Intel TD (Trust Domain) guest execution environment enumeration + + 0x21, 0, ebx, 31:0, tdx_vendorid_0 , TDX vendor ID string bytes 0 - 3 + 0x21, 0, ecx, 31:0, tdx_vendorid_2 , CPU vendor ID string bytes 8 - 11 + 0x21, 0, edx, 31:0, tdx_vendorid_1 , CPU vendor ID string bytes 4 - 7 + +# Leaf 23H +# Intel Architectural Performance Monitoring Extended (ArchPerfmonExt) + + 0x23, 0, eax, 1, subleaf_1_counters , Subleaf 1, PMU counters bitmaps, is valid + 0x23, 0, eax, 3, subleaf_3_events , Subleaf 3, PMU events bitmaps, is valid + 0x23, 0, ebx, 0, unitmask2 , IA32_PERFEVTSELx MSRs UnitMask2 is supported + 0x23, 0, ebx, 1, zbit , IA32_PERFEVTSELx MSRs Z-bit is supported + 0x23, 1, eax, 31:0, pmu_gp_counters_bitmap , General-purpose PMU counters bitmap + 0x23, 1, ebx, 31:0, pmu_f_counters_bitmap , Fixed PMU counters bitmap + 0x23, 3, eax, 0, core_cycles_evt , Core cycles event supported + 0x23, 3, eax, 1, insn_retired_evt , Instructions retired event supported + 0x23, 3, eax, 2, ref_cycles_evt , Reference cycles event supported + 0x23, 3, eax, 3, llc_refs_evt , Last-level cache references event supported + 0x23, 3, eax, 4, llc_misses_evt , Last-level cache misses event supported + 0x23, 3, eax, 5, br_insn_ret_evt , Branch instruction retired event supported + 0x23, 3, eax, 6, br_mispr_evt , Branch mispredict retired event supported + 0x23, 3, eax, 7, td_slots_evt , Topdown slots event supported + 0x23, 3, eax, 8, td_backend_bound_evt , Topdown backend bound event supported + 0x23, 3, eax, 9, td_bad_spec_evt , Topdown bad speculation event supported + 0x23, 3, eax, 10, td_frontend_bound_evt , Topdown frontend bound event supported + 0x23, 3, eax, 11, td_retiring_evt , Topdown retiring event support + +# Leaf 40000000H +# Maximum hypervisor standard leaf + hypervisor vendor string + +0x40000000, 0, eax, 31:0, max_hyp_leaf , Maximum hypervisor standard leaf number +0x40000000, 0, ebx, 31:0, hypervisor_id_0 , Hypervisor ID string bytes 0 - 3 +0x40000000, 0, ecx, 31:0, hypervisor_id_1 , Hypervisor ID string bytes 4 - 7 +0x40000000, 0, edx, 31:0, hypervisor_id_2 , Hypervisor ID string bytes 8 - 11 + +# Leaf 80000000H +# Maximum extended leaf number + CPU vendor string (AMD) + +0x80000000, 0, eax, 31:0, max_ext_leaf , Maximum extended cpuid leaf supported +0x80000000, 0, ebx, 31:0, cpu_vendorid_0 , Vendor ID string bytes 0 - 3 +0x80000000, 0, ecx, 31:0, cpu_vendorid_2 , Vendor ID string bytes 8 - 11 +0x80000000, 0, edx, 31:0, cpu_vendorid_1 , Vendor ID string bytes 4 - 7 # Leaf 80000001H -# Extended Processor Signature and Feature Bits +# Extended CPU feature identifiers -0x80000001, 0, EAX, 27:20, extfamily, Extended family -0x80000001, 0, EAX, 19:16, extmodel, Extended model -0x80000001, 0, EAX, 11:8, basefamily, Description of Family -0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product -0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model +0x80000001, 0, eax, 3:0, e_stepping_id , Stepping ID +0x80000001, 0, eax, 7:4, e_base_model , Base processor model +0x80000001, 0, eax, 11:8, e_base_family , Base processor family +0x80000001, 0, eax, 19:16, e_ext_model , Extended processor model +0x80000001, 0, eax, 27:20, e_ext_family , Extended processor family +0x80000001, 0, ebx, 15:0, brand_id , Brand ID +0x80000001, 0, ebx, 31:28, pkg_type , Package type +0x80000001, 0, ecx, 0, lahf_lm , LAHF and SAHF in 64-bit mode +0x80000001, 0, ecx, 1, cmp_legacy , Multi-processing legacy mode (No HT) +0x80000001, 0, ecx, 2, svm , Secure Virtual Machine +0x80000001, 0, ecx, 3, extapic , Extended APIC space +0x80000001, 0, ecx, 4, cr8_legacy , LOCK MOV CR0 means MOV CR8 +0x80000001, 0, ecx, 5, abm , LZCNT advanced bit manipulation +0x80000001, 0, ecx, 6, sse4a , SSE4A support +0x80000001, 0, ecx, 7, misalignsse , Misaligned SSE mode +0x80000001, 0, ecx, 8, 3dnowprefetch , 3DNow PREFETCH/PREFETCHW support +0x80000001, 0, ecx, 9, osvw , OS visible workaround +0x80000001, 0, ecx, 10, ibs , Instruction based sampling +0x80000001, 0, ecx, 11, xop , XOP: extended operation (AVX instructions) +0x80000001, 0, ecx, 12, skinit , SKINIT/STGI support +0x80000001, 0, ecx, 13, wdt , Watchdog timer support +0x80000001, 0, ecx, 15, lwp , Lightweight profiling +0x80000001, 0, ecx, 16, fma4 , 4-operand FMA instruction +0x80000001, 0, ecx, 17, tce , Translation cache extension +0x80000001, 0, ecx, 19, nodeid_msr , NodeId MSR (0xc001100c) +0x80000001, 0, ecx, 21, tbm , Trailing bit manipulations +0x80000001, 0, ecx, 22, topoext , Topology Extensions (cpuid leaf 0x8000001d) +0x80000001, 0, ecx, 23, perfctr_core , Core performance counter extensions +0x80000001, 0, ecx, 24, perfctr_nb , NB/DF performance counter extensions +0x80000001, 0, ecx, 26, bpext , Data access breakpoint extension +0x80000001, 0, ecx, 27, ptsc , Performance time-stamp counter +0x80000001, 0, ecx, 28, perfctr_llc , LLC (L3) performance counter extensions +0x80000001, 0, ecx, 29, mwaitx , MWAITX/MONITORX support +0x80000001, 0, ecx, 30, addr_mask_ext , Breakpoint address mask extension (to bit 31) +0x80000001, 0, edx, 0, e_fpu , Floating-Point Unit on-chip (x87) +0x80000001, 0, edx, 1, e_vme , Virtual-8086 Mode Extensions +0x80000001, 0, edx, 2, e_de , Debugging Extensions +0x80000001, 0, edx, 3, e_pse , Page Size Extension +0x80000001, 0, edx, 4, e_tsc , Time Stamp Counter +0x80000001, 0, edx, 5, e_msr , Model-Specific Registers (RDMSR and WRMSR support) +0x80000001, 0, edx, 6, pae , Physical Address Extensions +0x80000001, 0, edx, 7, mce , Machine Check Exception +0x80000001, 0, edx, 8, cx8 , CMPXCHG8B instruction +0x80000001, 0, edx, 9, apic , APIC on-chip +0x80000001, 0, edx, 11, syscall , SYSCALL and SYSRET instructions +0x80000001, 0, edx, 12, mtrr , Memory Type Range Registers +0x80000001, 0, edx, 13, pge , Page Global Extensions +0x80000001, 0, edx, 14, mca , Machine Check Architecture +0x80000001, 0, edx, 15, cmov , Conditional Move Instruction +0x80000001, 0, edx, 16, pat , Page Attribute Table +0x80000001, 0, edx, 17, pse36 , Page Size Extension (36-bit) +0x80000001, 0, edx, 19, mp , Out-of-spec AMD Multiprocessing bit +0x80000001, 0, edx, 20, nx , No-execute page protection +0x80000001, 0, edx, 22, mmxext , AMD MMX extensions +0x80000001, 0, edx, 24, e_fxsr , FXSAVE and FXRSTOR instructions +0x80000001, 0, edx, 25, fxsr_opt , FXSAVE and FXRSTOR optimizations +0x80000001, 0, edx, 26, pdpe1gb , 1-GB large page support +0x80000001, 0, edx, 27, rdtscp , RDTSCP instruction +0x80000001, 0, edx, 29, lm , Long mode (x86-64, 64-bit support) +0x80000001, 0, edx, 30, 3dnowext , AMD 3DNow extensions +0x80000001, 0, edx, 31, 3dnow , 3DNow instructions -0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type +# Leaf 80000002H +# CPU brand ID string, bytes 0 - 15 -0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode -0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode -0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA -0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space -0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8 -0x80000001, 0, ECX, 5, lzcnt, LZCNT -0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support -0x80000001, 0, ECX, 7, misalignsse, Misaligned SSE Mode -0x80000001, 0, ECX, 8, prefetchw, PREFETCHW -0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support -0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling -0x80000001, 0, ECX, 11, xop, Extended operation support -0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support -0x80000001, 0, ECX, 13, wdt, Watchdog timer support -0x80000001, 0, ECX, 15, lwp, Lightweight profiling support -0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support -0x80000001, 0, ECX, 17, tce, Translation cache extension -0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId -0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR -0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR -0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK -0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported -0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions -0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported -0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints +0x80000002, 0, eax, 31:0, cpu_brandid_0 , CPU brand ID string, bytes 0 - 3 +0x80000002, 0, ebx, 31:0, cpu_brandid_1 , CPU brand ID string, bytes 4 - 7 +0x80000002, 0, ecx, 31:0, cpu_brandid_2 , CPU brand ID string, bytes 8 - 11 +0x80000002, 0, edx, 31:0, cpu_brandid_3 , CPU brand ID string, bytes 12 - 15 -0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip -0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements -0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE -0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages) -0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD -0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions -0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE) -0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE -0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction -0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled -0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported -0x80000001, 0, EDX, 12, mtrr, Memory-type range registers -0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE -0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP -0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV -0x80000001, 0, EDX, 16, pat, Page attribute table -0x80000001, 0, EDX, 17, pse36, Page-size extensions -0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available -0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions -0x80000001, 0, EDX, 23, mmx, MMX instructions -0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions -0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations -0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported -0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available -0x80000001, 0, EDX, 29, lm, 64b Architecture supported -0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions -0x80000001, 0, EDX, 31, threednow, 3DNow! instructions +# Leaf 80000003H +# CPU brand ID string, bytes 16 - 31 -# Leaf 80000002H/80000003H/80000004H -# Processor Brand String +0x80000003, 0, eax, 31:0, cpu_brandid_4 , CPU brand ID string bytes, 16 - 19 +0x80000003, 0, ebx, 31:0, cpu_brandid_5 , CPU brand ID string bytes, 20 - 23 +0x80000003, 0, ecx, 31:0, cpu_brandid_6 , CPU brand ID string bytes, 24 - 27 +0x80000003, 0, edx, 31:0, cpu_brandid_7 , CPU brand ID string bytes, 28 - 31 + +# Leaf 80000004H +# CPU brand ID string, bytes 32 - 47 + +0x80000004, 0, eax, 31:0, cpu_brandid_8 , CPU brand ID string, bytes 32 - 35 +0x80000004, 0, ebx, 31:0, cpu_brandid_9 , CPU brand ID string, bytes 36 - 39 +0x80000004, 0, ecx, 31:0, cpu_brandid_10 , CPU brand ID string, bytes 40 - 43 +0x80000004, 0, edx, 31:0, cpu_brandid_11 , CPU brand ID string, bytes 44 - 47 # Leaf 80000005H -# Reserved +# AMD L1 cache and L1 TLB enumeration + +0x80000005, 0, eax, 7:0, l1_itlb_2m_4m_nentries , L1 ITLB #entires, 2M and 4M pages +0x80000005, 0, eax, 15:8, l1_itlb_2m_4m_assoc , L1 ITLB associativity, 2M and 4M pages +0x80000005, 0, eax, 23:16, l1_dtlb_2m_4m_nentries , L1 DTLB #entires, 2M and 4M pages +0x80000005, 0, eax, 31:24, l1_dtlb_2m_4m_assoc , L1 DTLB associativity, 2M and 4M pages +0x80000005, 0, ebx, 7:0, l1_itlb_4k_nentries , L1 ITLB #entries, 4K pages +0x80000005, 0, ebx, 15:8, l1_itlb_4k_assoc , L1 ITLB associativity, 4K pages +0x80000005, 0, ebx, 23:16, l1_dtlb_4k_nentries , L1 DTLB #entries, 4K pages +0x80000005, 0, ebx, 31:24, l1_dtlb_4k_assoc , L1 DTLB associativity, 4K pages +0x80000005, 0, ecx, 7:0, l1_dcache_line_size , L1 dcache line size, in bytes +0x80000005, 0, ecx, 15:8, l1_dcache_nlines , L1 dcache lines per tag +0x80000005, 0, ecx, 23:16, l1_dcache_assoc , L1 dcache associativity +0x80000005, 0, ecx, 31:24, l1_dcache_size_kb , L1 dcache size, in KB +0x80000005, 0, edx, 7:0, l1_icache_line_size , L1 icache line size, in bytes +0x80000005, 0, edx, 15:8, l1_icache_nlines , L1 icache lines per tag +0x80000005, 0, edx, 23:16, l1_icache_assoc , L1 icache associativity +0x80000005, 0, edx, 31:24, l1_icache_size_kb , L1 icache size, in KB # Leaf 80000006H -# Extended L2 Cache Features - -0x80000006, 0, ECX, 7:0, clsize, Cache Line size in bytes -0x80000006, 0, ECX, 15:12, l2c_assoc, L2 Associativity -0x80000006, 0, ECX, 31:16, csize, Cache size in 1K units +# (Mostly AMD) L2 TLB, L2 cache, and L3 cache enumeration +0x80000006, 0, eax, 11:0, l2_itlb_2m_4m_nentries , L2 iTLB #entries, 2M and 4M pages +0x80000006, 0, eax, 15:12, l2_itlb_2m_4m_assoc , L2 iTLB associativity, 2M and 4M pages +0x80000006, 0, eax, 27:16, l2_dtlb_2m_4m_nentries , L2 dTLB #entries, 2M and 4M pages +0x80000006, 0, eax, 31:28, l2_dtlb_2m_4m_assoc , L2 dTLB associativity, 2M and 4M pages +0x80000006, 0, ebx, 11:0, l2_itlb_4k_nentries , L2 iTLB #entries, 4K pages +0x80000006, 0, ebx, 15:12, l2_itlb_4k_assoc , L2 iTLB associativity, 4K pages +0x80000006, 0, ebx, 27:16, l2_dtlb_4k_nentries , L2 dTLB #entries, 4K pages +0x80000006, 0, ebx, 31:28, l2_dtlb_4k_assoc , L2 dTLB associativity, 4K pages +0x80000006, 0, ecx, 7:0, l2_line_size , L2 cache line size, in bytes +0x80000006, 0, ecx, 11:8, l2_nlines , L2 cache number of lines per tag +0x80000006, 0, ecx, 15:12, l2_assoc , L2 cache associativity +0x80000006, 0, ecx, 31:16, l2_size_kb , L2 cache size, in KB +0x80000006, 0, edx, 7:0, l3_line_size , L3 cache line size, in bytes +0x80000006, 0, edx, 11:8, l3_nlines , L3 cache number of lines per tag +0x80000006, 0, edx, 15:12, l3_assoc , L3 cache associativity +0x80000006, 0, edx, 31:18, l3_size_range , L3 cache size range # Leaf 80000007H +# CPU power management (mostly AMD) and AMD RAS enumeration -0x80000007, 0, EDX, 8, nonstop_tsc, Invariant TSC available - +0x80000007, 0, ebx, 0, overflow_recov , MCA overflow conditions not fatal +0x80000007, 0, ebx, 1, succor , Software containment of UnCORRectable errors +0x80000007, 0, ebx, 2, hw_assert , Hardware assert MSRs +0x80000007, 0, ebx, 3, smca , Scalable MCA (MCAX MSRs) +0x80000007, 0, ecx, 31:0, cpu_pwr_sample_ratio , CPU power sample time ratio +0x80000007, 0, edx, 0, digital_temp , Digital temprature sensor +0x80000007, 0, edx, 1, powernow_freq_id , PowerNOW! frequency scaling +0x80000007, 0, edx, 2, powernow_volt_id , PowerNOW! voltage scaling +0x80000007, 0, edx, 3, thermal_trip , THERMTRIP (Thermal Trip) +0x80000007, 0, edx, 4, hw_thermal_control , Hardware thermal control +0x80000007, 0, edx, 5, sw_thermal_control , Software thermal control +0x80000007, 0, edx, 6, 100mhz_steps , 100 MHz multiplier control +0x80000007, 0, edx, 7, hw_pstate , Hardware P-state control +0x80000007, 0, edx, 8, constant_tsc , TSC ticks at constant rate across all P and C states +0x80000007, 0, edx, 9, cpb , Core performance boost +0x80000007, 0, edx, 10, eff_freq_ro , Read-only effective frequency interface +0x80000007, 0, edx, 11, proc_feedback , Processor feedback interface (deprecated) +0x80000007, 0, edx, 12, acc_power , Processor power reporting interface +0x80000007, 0, edx, 13, connected_standby , CPU Connected Standby support +0x80000007, 0, edx, 14, rapl , Runtime Average Power Limit interface # Leaf 80000008H +# CPU capacity parameters and extended feature flags (mostly AMD) -0x80000008, 0, EAX, 7:0, phy_adr_bits, Physical Address Bits -0x80000008, 0, EAX, 15:8, lnr_adr_bits, Linear Address Bits -0x80000007, 0, EBX, 9, wbnoinvd, WBNOINVD +0x80000008, 0, eax, 7:0, phys_addr_bits , Max physical address bits +0x80000008, 0, eax, 15:8, virt_addr_bits , Max virtual address bits +0x80000008, 0, eax, 23:16, guest_phys_addr_bits , Max nested-paging guest physical address bits +0x80000008, 0, ebx, 0, clzero , CLZERO supported +0x80000008, 0, ebx, 1, irperf , Instruction retired counter MSR +0x80000008, 0, ebx, 2, xsaveerptr , XSAVE/XRSTOR always saves/restores FPU error pointers +0x80000008, 0, ebx, 3, invlpgb , INVLPGB broadcasts a TLB invalidate to all threads +0x80000008, 0, ebx, 4, rdpru , RDPRU (Read Processor Register at User level) supported +0x80000008, 0, ebx, 6, mba , Memory Bandwidth Allocation (AMD bit) +0x80000008, 0, ebx, 8, mcommit , MCOMMIT (Memory commit) supported +0x80000008, 0, ebx, 9, wbnoinvd , WBNOINVD supported +0x80000008, 0, ebx, 12, amd_ibpb , Indirect Branch Prediction Barrier +0x80000008, 0, ebx, 13, wbinvd_int , Interruptible WBINVD/WBNOINVD +0x80000008, 0, ebx, 14, amd_ibrs , Indirect Branch Restricted Speculation +0x80000008, 0, ebx, 15, amd_stibp , Single Thread Indirect Branch Prediction mode +0x80000008, 0, ebx, 16, ibrs_always_on , IBRS always-on preferred +0x80000008, 0, ebx, 17, amd_stibp_always_on , STIBP always-on preferred +0x80000008, 0, ebx, 18, ibrs_fast , IBRS is preferred over software solution +0x80000008, 0, ebx, 19, ibrs_same_mode , IBRS provides same mode protection +0x80000008, 0, ebx, 20, no_efer_lmsle , EFER[LMSLE] bit (Long-Mode Segment Limit Enable) unsupported +0x80000008, 0, ebx, 21, tlb_flush_nested , INVLPGB RAX[5] bit can be set (nested translations) +0x80000008, 0, ebx, 23, amd_ppin , Protected Processor Inventory Number +0x80000008, 0, ebx, 24, amd_ssbd , Speculative Store Bypass Disable +0x80000008, 0, ebx, 25, virt_ssbd , virtualized SSBD (Speculative Store Bypass Disable) +0x80000008, 0, ebx, 26, amd_ssb_no , SSBD not needed (fixed in HW) +0x80000008, 0, ebx, 27, cppc , Collaborative Processor Performance Control +0x80000008, 0, ebx, 28, amd_psfd , Predictive Store Forward Disable +0x80000008, 0, ebx, 29, btc_no , CPU not affected by Branch Type Confusion +0x80000008, 0, ebx, 30, ibpb_ret , IBPB clears RSB/RAS too +0x80000008, 0, ebx, 31, brs , Branch Sampling supported +0x80000008, 0, ecx, 7:0, cpu_nthreads , Number of physical threads - 1 +0x80000008, 0, ecx, 15:12, apicid_coreid_len , Number of thread core ID bits (shift) in APIC ID +0x80000008, 0, ecx, 17:16, perf_tsc_len , Performance time-stamp counter size +0x80000008, 0, edx, 15:0, invlpgb_max_pages , INVLPGB maximum page count +0x80000008, 0, edx, 31:16, rdpru_max_reg_id , RDPRU max register ID (ECX input) -# 0x8000001E -# EAX: Extended APIC ID -0x8000001E, 0, EAX, 31:0, extended_apic_id, Extended APIC ID -# EBX: Core Identifiers -0x8000001E, 0, EBX, 7:0, core_id, Identifies the logical core ID -0x8000001E, 0, EBX, 15:8, threads_per_core, The number of threads per core is threads_per_core + 1 -# ECX: Node Identifiers -0x8000001E, 0, ECX, 7:0, node_id, Node ID -0x8000001E, 0, ECX, 10:8, nodes_per_processor, Nodes per processor { 0: 1 node, else reserved } +# Leaf 8000000AH +# AMD SVM (Secure Virtual Machine) enumeration -# 8000001F: AMD Secure Encryption -0x8000001F, 0, EAX, 0, sme, Secure Memory Encryption -0x8000001F, 0, EAX, 1, sev, Secure Encrypted Virtualization -0x8000001F, 0, EAX, 2, vmpgflush, VM Page Flush MSR -0x8000001F, 0, EAX, 3, seves, SEV Encrypted State -0x8000001F, 0, EBX, 5:0, c-bit, Page table bit number used to enable memory encryption -0x8000001F, 0, EBX, 11:6, mem_encrypt_physaddr_width, Reduction of physical address space in bits with SME enabled -0x8000001F, 0, ECX, 31:0, num_encrypted_guests, Maximum ASID value that may be used for an SEV-enabled guest -0x8000001F, 0, EDX, 31:0, minimum_sev_asid, Minimum ASID value that must be used for an SEV-enabled, SEV-ES-disabled guest +0x8000000a, 0, eax, 7:0, svm_version , SVM revision number +0x8000000a, 0, ebx, 31:0, svm_nasid , Number of address space identifiers (ASID) +0x8000000a, 0, edx, 0, npt , Nested paging +0x8000000a, 0, edx, 1, lbrv , LBR virtualization +0x8000000a, 0, edx, 2, svm_lock , SVM lock +0x8000000a, 0, edx, 3, nrip_save , NRIP save support on #VMEXIT +0x8000000a, 0, edx, 4, tsc_scale , MSR based TSC rate control +0x8000000a, 0, edx, 5, vmcb_clean , VMCB clean bits support +0x8000000a, 0, edx, 6, flushbyasid , Flush by ASID + Extended VMCB TLB_Control +0x8000000a, 0, edx, 7, decodeassists , Decode Assists support +0x8000000a, 0, edx, 10, pausefilter , Pause intercept filter +0x8000000a, 0, edx, 12, pfthreshold , Pause filter threshold +0x8000000a, 0, edx, 13, avic , Advanced virtual interrupt controller +0x8000000a, 0, edx, 15, v_vmsave_vmload , Virtual VMSAVE/VMLOAD (nested virt) +0x8000000a, 0, edx, 16, vgif , Virtualize the Global Interrupt Flag +0x8000000a, 0, edx, 17, gmet , Guest mode execution trap +0x8000000a, 0, edx, 18, x2avic , Virtual x2APIC +0x8000000a, 0, edx, 19, sss_check , Supervisor Shadow Stack restrictions +0x8000000a, 0, edx, 20, v_spec_ctrl , Virtual SPEC_CTRL +0x8000000a, 0, edx, 21, ro_gpt , Read-Only guest page table support +0x8000000a, 0, edx, 23, h_mce_override , Host MCE override +0x8000000a, 0, edx, 24, tlbsync_int , TLBSYNC intercept + INVLPGB/TLBSYNC in VMCB +0x8000000a, 0, edx, 25, vnmi , NMI virtualization +0x8000000a, 0, edx, 26, ibs_virt , IBS Virtualization +0x8000000a, 0, edx, 27, ext_lvt_off_chg , Extended LVT offset fault change +0x8000000a, 0, edx, 28, svme_addr_chk , Guest SVME addr check + +# Leaf 80000019H +# AMD TLB 1G-pages enumeration + +0x80000019, 0, eax, 11:0, l1_itlb_1g_nentries , L1 iTLB #entries, 1G pages +0x80000019, 0, eax, 15:12, l1_itlb_1g_assoc , L1 iTLB associativity, 1G pages +0x80000019, 0, eax, 27:16, l1_dtlb_1g_nentries , L1 dTLB #entries, 1G pages +0x80000019, 0, eax, 31:28, l1_dtlb_1g_assoc , L1 dTLB associativity, 1G pages +0x80000019, 0, ebx, 11:0, l2_itlb_1g_nentries , L2 iTLB #entries, 1G pages +0x80000019, 0, ebx, 15:12, l2_itlb_1g_assoc , L2 iTLB associativity, 1G pages +0x80000019, 0, ebx, 27:16, l2_dtlb_1g_nentries , L2 dTLB #entries, 1G pages +0x80000019, 0, ebx, 31:28, l2_dtlb_1g_assoc , L2 dTLB associativity, 1G pages + +# Leaf 8000001AH +# AMD instruction optimizations enumeration + +0x8000001a, 0, eax, 0, fp_128 , Internal FP/SIMD exec data path is 128-bits wide +0x8000001a, 0, eax, 1, movu_preferred , SSE: MOVU* better than MOVL*/MOVH* +0x8000001a, 0, eax, 2, fp_256 , internal FP/SSE exec data path is 256-bits wide + +# Leaf 8000001BH +# AMD IBS (Instruction-Based Sampling) enumeration + +0x8000001b, 0, eax, 0, ibs_flags_valid , IBS feature flags valid +0x8000001b, 0, eax, 1, ibs_fetch_sampling , IBS fetch sampling supported +0x8000001b, 0, eax, 2, ibs_op_sampling , IBS execution sampling supported +0x8000001b, 0, eax, 3, ibs_rdwr_op_counter , IBS read/write of op counter supported +0x8000001b, 0, eax, 4, ibs_op_count , IBS OP counting mode supported +0x8000001b, 0, eax, 5, ibs_branch_target , IBS branch target address reporting supported +0x8000001b, 0, eax, 6, ibs_op_counters_ext , IBS IbsOpCurCnt/IbsOpMaxCnt extend by 7 bits +0x8000001b, 0, eax, 7, ibs_rip_invalid_chk , IBS invalid RIP indication supported +0x8000001b, 0, eax, 8, ibs_op_branch_fuse , IBS fused branch micro-op indication supported +0x8000001b, 0, eax, 9, ibs_fetch_ctl_ext , IBS Fetch Control Extended MSR (0xc001103c) supported +0x8000001b, 0, eax, 10, ibs_op_data_4 , IBS op data 4 MSR supported +0x8000001b, 0, eax, 11, ibs_l3_miss_filter , IBS L3-miss filtering supported (Zen4+) + +# Leaf 8000001CH +# AMD LWP (Lightweight Profiling) + +0x8000001c, 0, eax, 0, os_lwp_avail , LWP is available to application programs (supported by OS) +0x8000001c, 0, eax, 1, os_lpwval , LWPVAL instruction (EventId=1) is supported by OS +0x8000001c, 0, eax, 2, os_lwp_ire , Instructions Retired Event (EventId=2) is supported by OS +0x8000001c, 0, eax, 3, os_lwp_bre , Branch Retired Event (EventId=3) is supported by OS +0x8000001c, 0, eax, 4, os_lwp_dme , DCache Miss Event (EventId=4) is supported by OS +0x8000001c, 0, eax, 5, os_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is supported by OS +0x8000001c, 0, eax, 6, os_lwp_rnh , CPU Reference clocks Not Halted event (EventId=6) is supported by OS +0x8000001c, 0, eax, 29, os_lwp_cont , LWP sampling in continuous mode is supported by OS +0x8000001c, 0, eax, 30, os_lwp_ptsc , Performance Time Stamp Counter in event records is supported by OS +0x8000001c, 0, eax, 31, os_lwp_int , Interrupt on threshold overflow is supported by OS +0x8000001c, 0, ebx, 7:0, lwp_lwpcb_sz , LWP Control Block size, in quadwords +0x8000001c, 0, ebx, 15:8, lwp_event_sz , LWP event record size, in bytes +0x8000001c, 0, ebx, 23:16, lwp_max_events , LWP max supported EventId value (EventID 255 not included) +0x8000001c, 0, ebx, 31:24, lwp_event_offset , LWP events area offset in the LWP Control Block +0x8000001c, 0, ecx, 4:0, lwp_latency_max , Num of bits in cache latency counters (10 to 31) +0x8000001c, 0, ecx, 5, lwp_data_adddr , Cache miss events report the data address of the reference +0x8000001c, 0, ecx, 8:6, lwp_latency_rnd , Amount by which cache latency is rounded +0x8000001c, 0, ecx, 15:9, lwp_version , LWP implementation version +0x8000001c, 0, ecx, 23:16, lwp_buf_min_sz , LWP event ring buffer min size, in units of 32 event records +0x8000001c, 0, ecx, 28, lwp_branch_predict , Branches Retired events can be filtered +0x8000001c, 0, ecx, 29, lwp_ip_filtering , IP filtering (IPI, IPF, BaseIP, and LimitIP @ LWPCP) supported +0x8000001c, 0, ecx, 30, lwp_cache_levels , Cache-related events can be filtered by cache level +0x8000001c, 0, ecx, 31, lwp_cache_latency , Cache-related events can be filtered by latency +0x8000001c, 0, edx, 0, hw_lwp_avail , LWP is available in Hardware +0x8000001c, 0, edx, 1, hw_lpwval , LWPVAL instruction (EventId=1) is available in HW +0x8000001c, 0, edx, 2, hw_lwp_ire , Instructions Retired Event (EventId=2) is available in HW +0x8000001c, 0, edx, 3, hw_lwp_bre , Branch Retired Event (EventId=3) is available in HW +0x8000001c, 0, edx, 4, hw_lwp_dme , DCache Miss Event (EventId=4) is available in HW +0x8000001c, 0, edx, 5, hw_lwp_cnh , CPU Clocks Not Halted event (EventId=5) is available in HW +0x8000001c, 0, edx, 6, hw_lwp_rnh , CPU Reference clocks Not Halted event (EventId=6) is available in HW +0x8000001c, 0, edx, 29, hw_lwp_cont , LWP sampling in continuous mode is available in HW +0x8000001c, 0, edx, 30, hw_lwp_ptsc , Performance Time Stamp Counter in event records is available in HW +0x8000001c, 0, edx, 31, hw_lwp_int , Interrupt on threshold overflow is available in HW + +# Leaf 8000001DH +# AMD deterministic cache parameters + +0x8000001d, 31:0, eax, 4:0, cache_type , Cache type field +0x8000001d, 31:0, eax, 7:5, cache_level , Cache level (1-based) +0x8000001d, 31:0, eax, 8, cache_self_init , Self-initializing cache level +0x8000001d, 31:0, eax, 9, fully_associative , Fully-associative cache +0x8000001d, 31:0, eax, 25:14, num_threads_sharing , Number of logical CPUs sharing cache +0x8000001d, 31:0, ebx, 11:0, cache_linesize , System coherency line size (0-based) +0x8000001d, 31:0, ebx, 21:12, cache_npartitions , Physical line partitions (0-based) +0x8000001d, 31:0, ebx, 31:22, cache_nways , Ways of associativity (0-based) +0x8000001d, 31:0, ecx, 30:0, cache_nsets , Cache number of sets (0-based) +0x8000001d, 31:0, edx, 0, wbinvd_rll_no_guarantee, WBINVD/INVD not guaranteed for Remote Lower-Level caches +0x8000001d, 31:0, edx, 1, ll_inclusive , Cache is inclusive of Lower-Level caches + +# Leaf 8000001EH +# AMD CPU topology enumeration + +0x8000001e, 0, eax, 31:0, ext_apic_id , Extended APIC ID +0x8000001e, 0, ebx, 7:0, core_id , Unique per-socket logical core unit ID +0x8000001e, 0, ebx, 15:8, core_nthreas , #Threads per core (zero-based) +0x8000001e, 0, ecx, 7:0, node_id , Node (die) ID of invoking logical CPU +0x8000001e, 0, ecx, 10:8, nnodes_per_socket , #nodes in invoking logical CPU's package/socket + +# Leaf 8000001FH +# AMD encrypted memory capabilities enumeration (SME/SEV) + +0x8000001f, 0, eax, 0, sme , Secure Memory Encryption supported +0x8000001f, 0, eax, 1, sev , Secure Encrypted Virtualization supported +0x8000001f, 0, eax, 2, vm_page_flush , VM Page Flush MSR (0xc001011e) available +0x8000001f, 0, eax, 3, sev_es , SEV Encrypted State supported +0x8000001f, 0, eax, 4, sev_nested_paging , SEV secure nested paging supported +0x8000001f, 0, eax, 5, vm_permission_levels , VMPL supported +0x8000001f, 0, eax, 6, rpmquery , RPMQUERY instruction supported +0x8000001f, 0, eax, 7, vmpl_sss , VMPL supervisor shadwo stack supported +0x8000001f, 0, eax, 8, secure_tsc , Secure TSC supported +0x8000001f, 0, eax, 9, v_tsc_aux , Hardware virtualizes TSC_AUX +0x8000001f, 0, eax, 10, sme_coherent , HW enforces cache coherency across encryption domains +0x8000001f, 0, eax, 11, req_64bit_hypervisor , SEV guest mandates 64-bit hypervisor +0x8000001f, 0, eax, 12, restricted_injection , Restricted Injection supported +0x8000001f, 0, eax, 13, alternate_injection , Alternate Injection supported +0x8000001f, 0, eax, 14, debug_swap , SEV-ES: full debug state swap is supported +0x8000001f, 0, eax, 15, disallow_host_ibs , SEV-ES: Disallowing IBS use by the host is supported +0x8000001f, 0, eax, 16, virt_transparent_enc , Virtual Transparent Encryption +0x8000001f, 0, eax, 17, vmgexit_paremeter , VmgexitParameter is supported in SEV_FEATURES +0x8000001f, 0, eax, 18, virt_tom_msr , Virtual TOM MSR is supported +0x8000001f, 0, eax, 19, virt_ibs , IBS state virtualization is supported for SEV-ES guests +0x8000001f, 0, eax, 24, vmsa_reg_protection , VMSA register protection is supported +0x8000001f, 0, eax, 25, smt_protection , SMT protection is supported +0x8000001f, 0, eax, 28, svsm_page_msr , SVSM communication page MSR (0xc001f000h) is supported +0x8000001f, 0, eax, 29, nested_virt_snp_msr , VIRT_RMPUPDATE/VIRT_PSMASH MSRs are supported +0x8000001f, 0, ebx, 5:0, pte_cbit_pos , PTE bit number used to enable memory encryption +0x8000001f, 0, ebx, 11:6, phys_addr_reduction_nbits, Reduction of phys address space when encryption is enabled, in bits +0x8000001f, 0, ebx, 15:12, vmpl_count , Number of VM permission levels (VMPL) supported +0x8000001f, 0, ecx, 31:0, enc_guests_max , Max supported number of simultaneous encrypted guests +0x8000001f, 0, edx, 31:0, min_sev_asid_no_sev_es , Mininum ASID for SEV-enabled SEV-ES-disabled guest + +# Leaf 80000020H +# AMD Platform QoS extended feature IDs + +0x80000020, 0, ebx, 1, mba , Memory Bandwidth Allocation support +0x80000020, 0, ebx, 2, smba , Slow Memory Bandwidth Allocation support +0x80000020, 0, ebx, 3, bmec , Bandwidth Monitoring Event Configuration support +0x80000020, 0, ebx, 4, l3rr , L3 Range Reservation support +0x80000020, 1, eax, 31:0, mba_limit_len , MBA enforcement limit size +0x80000020, 1, edx, 31:0, mba_cos_max , MBA max Class of Service number (zero-based) +0x80000020, 2, eax, 31:0, smba_limit_len , SMBA enforcement limit size +0x80000020, 2, edx, 31:0, smba_cos_max , SMBA max Class of Service number (zero-based) +0x80000020, 3, ebx, 7:0, bmec_num_events , BMEC number of bandwidth events available +0x80000020, 3, ecx, 0, bmec_local_reads , Local NUMA reads can be tracked +0x80000020, 3, ecx, 1, bmec_remote_reads , Remote NUMA reads can be tracked +0x80000020, 3, ecx, 2, bmec_local_nontemp_wr , Local NUMA non-temporal writes can be tracked +0x80000020, 3, ecx, 3, bmec_remote_nontemp_wr , Remote NUMA non-temporal writes can be tracked +0x80000020, 3, ecx, 4, bmec_local_slow_mem_rd , Local NUMA slow-memory reads can be tracked +0x80000020, 3, ecx, 5, bmec_remote_slow_mem_rd, Remote NUMA slow-memory reads can be tracked +0x80000020, 3, ecx, 6, bmec_all_dirty_victims , Dirty QoS victims to all types of memory can be tracked + +# Leaf 80000021H +# AMD extended features enumeration 2 + +0x80000021, 0, eax, 0, no_nested_data_bp , No nested data breakpoints +0x80000021, 0, eax, 1, fsgs_non_serializing , WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing +0x80000021, 0, eax, 2, lfence_rdtsc , LFENCE always serializing / synchronizes RDTSC +0x80000021, 0, eax, 3, smm_page_cfg_lock , SMM paging configuration lock is supported +0x80000021, 0, eax, 6, null_sel_clr_base , Null selector clears base +0x80000021, 0, eax, 7, upper_addr_ignore , EFER MSR Upper Address Ignore Enable bit supported +0x80000021, 0, eax, 8, autoibrs , EFER MSR Automatic IBRS enable bit supported +0x80000021, 0, eax, 9, no_smm_ctl_msr , SMM_CTL MSR (0xc0010116) is not present +0x80000021, 0, eax, 10, fsrs_supported , Fast Short Rep Stosb (FSRS) is supported +0x80000021, 0, eax, 11, fsrc_supported , Fast Short Repe Cmpsb (FSRC) is supported +0x80000021, 0, eax, 13, prefetch_ctl_msr , Prefetch control MSR is supported +0x80000021, 0, eax, 17, user_cpuid_disable , #GP when executing CPUID at CPL > 0 is supported +0x80000021, 0, eax, 18, epsf_supported , Enhanced Predictive Store Forwarding (EPSF) is supported +0x80000021, 0, ebx, 11:0, microcode_patch_size , Size of microcode patch, in 16-byte units + +# Leaf 80000022H +# AMD Performance Monitoring v2 enumeration + +0x80000022, 0, eax, 0, perfmon_v2 , Performance monitoring v2 supported +0x80000022, 0, eax, 1, lbr_v2 , Last Branch Record v2 extensions (LBR Stack) +0x80000022, 0, eax, 2, lbr_pmc_freeze , Freezing core performance counters / LBR Stack supported +0x80000022, 0, ebx, 3:0, n_pmc_core , Number of core perfomance counters +0x80000022, 0, ebx, 9:4, lbr_v2_stack_size , Number of available LBR stack entries +0x80000022, 0, ebx, 15:10, n_pmc_northbridge , Number of available northbridge (data fabric) performance counters +0x80000022, 0, ebx, 21:16, n_pmc_umc , Number of available UMC performance counters +0x80000022, 0, ecx, 31:0, active_umc_bitmask , Active UMCs bitmask + +# Leaf 80000023H +# AMD Secure Multi-key Encryption enumeration + +0x80000023, 0, eax, 0, mem_hmk_mode , MEM-HMK encryption mode is supported +0x80000023, 0, ebx, 15:0, mem_hmk_avail_keys , MEM-HMK mode: total num of available encryption keys + +# Leaf 80000026H +# AMD extended topology enumeration v2 + +0x80000026, 3:0, eax, 4:0, x2apic_id_shift , Bit width of this level (previous levels inclusive) +0x80000026, 3:0, eax, 29, core_has_pwreff_ranking, This core has a power efficiency ranking +0x80000026, 3:0, eax, 30, domain_has_hybrid_cores, This domain level has hybrid (E, P) cores +0x80000026, 3:0, eax, 31, domain_core_count_asymm, The 'Core' domain has asymmetric cores count +0x80000026, 3:0, ebx, 15:0, domain_lcpus_count , Number of logical CPUs at this domain instance +0x80000026, 3:0, ebx, 23:16, core_pwreff_ranking , This core's static power efficiency ranking +0x80000026, 3:0, ebx, 27:24, core_native_model_id , This core's native model ID +0x80000026, 3:0, ebx, 31:28, core_type , This core's type +0x80000026, 3:0, ecx, 7:0, domain_level , This domain level (subleaf ID) +0x80000026, 3:0, ecx, 15:8, domain_type , This domain type +0x80000026, 3:0, edx, 31:0, x2apic_id , x2APIC ID of current logical CPU From ea66e7107bdc4efb530878f2216390b8bc5bae0d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 18 Jul 2024 15:47:49 +0200 Subject: [PATCH 023/126] MAINTAINERS: Add x86 cpuid database entry Add a specific entry for the x86 architecture cpuid database. Reference the x86-cpuid.org development mailing list to facilitate easy tracking by external stakeholders such as the Xen developers. Include myself as a reviewer. Note, this MAINTAINERS entry will also cover the auto-generated C cpuid bitfields header files to be submitted in a future series. Signed-off-by: Ahmed S. Darwish Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240718134755.378115-10-darwi@linutronix.de --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..6daab3f05958 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24773,6 +24773,16 @@ F: Documentation/arch/x86/ F: Documentation/devicetree/bindings/x86/ F: arch/x86/ +X86 CPUID DATABASE +M: Borislav Petkov +M: Thomas Gleixner +M: x86@kernel.org +R: Ahmed S. Darwish +L: x86-cpuid@lists.linux.dev +S: Maintained +W: https://x86-cpuid.org +F: tools/arch/x86/kcpuid/cpuid.csv + X86 ENTRY CODE M: Andy Lutomirski L: linux-kernel@vger.kernel.org From 24cf2bc982ffe02aeffb4a3885c71751a2c7023b Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Fri, 2 Aug 2024 06:13:14 +0000 Subject: [PATCH 024/126] x86/pkeys: Add PKRU as a parameter in signal handling functions Assume there's a multithreaded application that runs untrusted user code. Each thread has its stack/code protected by a non-zero PKEY, and the PKRU register is set up such that only that particular non-zero PKEY is enabled. Each thread also sets up an alternate signal stack to handle signals, which is protected by PKEY zero. The PKEYs man page documents that the PKRU will be reset to init_pkru when the signal handler is invoked, which means that PKEY zero access will be enabled. But this reset happens after the kernel attempts to push fpu state to the alternate stack, which is not (yet) accessible by the kernel, which leads to a new SIGSEGV being sent to the application, terminating it. Enabling both the non-zero PKEY (for the thread) and PKEY zero in userspace will not work for this use case. It cannot have the alt stack writeable by all - the rationale here is that the code running in that thread (using a non-zero PKEY) is untrusted and should not have access to the alternate signal stack (that uses PKEY zero), to prevent the return address of a function from being changed. The expectation is that kernel should be able to set up the alternate signal stack and deliver the signal to the application even if PKEY zero is explicitly disabled by the application. The signal handler accessibility should not be dictated by whatever PKRU value the thread sets up. The PKRU register is managed by XSAVE, which means the sigframe contents must match the register contents - which is not the case here. It's required that the signal frame contains the user-defined PKRU value (so that it is restored correctly from sigcontext) but the actual register must be reset to init_pkru so that the alt stack is accessible and the signal can be delivered to the application. It seems that the proper fix here would be to remove PKRU from the XSAVE framework and manage it separately, which is quite complicated. As a workaround, do this: orig_pkru = rdpkru(); wrpkru(orig_pkru & init_pkru_value); xsave_to_user_sigframe(); put_user(pkru_sigframe_addr, orig_pkru) In preparation for writing PKRU to sigframe, pass PKRU as an additional parameter down the call chain from get_sigframe(). No functional change. Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-2-aruna.ramakrishna@oracle.com --- arch/x86/include/asm/fpu/signal.h | 2 +- arch/x86/kernel/fpu/signal.c | 6 +++--- arch/x86/kernel/signal.c | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 611fa41711af..eccc75bc9c4f 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru); extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 247f2225aa9f..2b3b9e140dd4 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -156,7 +156,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, return !err; } -static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { if (use_xsave()) return xsave_to_user_sigframe(buf); @@ -185,7 +185,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; @@ -228,7 +228,7 @@ retry: fpregs_restore_userregs(); pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); + ret = copy_fpregs_to_sigframe(buf_fx, pkru); pagefault_enable(); fpregs_unlock(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 31b6f5dddfc2..1f1e8e0ac5a3 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -84,6 +84,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; + u32 pkru = read_pkru(); /* redzone */ if (!ia32_frame) @@ -139,7 +140,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) return (void __user *)-1L; return (void __user *)sp; From 84ee6e8d195e4af4c6c4c961bbf9266bdc8b90ac Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Fri, 2 Aug 2024 06:13:15 +0000 Subject: [PATCH 025/126] x86/pkeys: Add helper functions to update PKRU on the sigframe In the case where a user thread sets up an alternate signal stack protected by the default PKEY (i.e. PKEY 0), while the thread's stack is protected by a non-zero PKEY, both these PKEYS have to be enabled in the PKRU register for the signal to be delivered to the application correctly. However, the PKRU value restored after handling the signal must not enable this extra PKEY (i.e. PKEY 0) - i.e., the PKRU value in the sigframe has to be overwritten with the user-defined value. Add helper functions that will update PKRU value in the sigframe after XSAVE. Note that sig_prepare_pkru() makes no assumption about which PKEY could be used to protect the altstack (i.e. it may not be part of init_pkru), and so enables all PKEYS. No functional change. Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-3-aruna.ramakrishna@oracle.com --- arch/x86/kernel/fpu/signal.c | 10 ++++++++++ arch/x86/kernel/fpu/xstate.c | 13 +++++++++++++ arch/x86/kernel/fpu/xstate.h | 2 ++ arch/x86/kernel/signal.c | 18 ++++++++++++++++++ 4 files changed, 43 insertions(+) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 2b3b9e140dd4..931c5469d7f3 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -63,6 +63,16 @@ setfx: return true; } +/* + * Update the value of PKRU register that was already pushed onto the signal frame. + */ +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) +{ + if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) + return 0; + return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); +} + /* * Signal frame handlers. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c5a026fee5e0..fa7628bb541b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -993,6 +993,19 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) } EXPORT_SYMBOL_GPL(get_xsave_addr); +/* + * Given an xstate feature nr, calculate where in the xsave buffer the state is. + * The xsave buffer should be in standard format, not compacted (e.g. user mode + * signal frames). + */ +void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) +{ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + return (void __user *)xsave + xstate_offsets[xfeature_nr]; +} + #ifdef CONFIG_ARCH_HAS_PKEYS /* diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 2ee0b9c53dcc..5f057e50df81 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -54,6 +54,8 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); +extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); + static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 1f1e8e0ac5a3..9dc77ad03a0e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -60,6 +60,24 @@ static inline int is_x32_frame(struct ksignal *ksig) ksig->ka.sa.sa_flags & SA_X32_ABI; } +/* + * Enable all pkeys temporarily, so as to ensure that both the current + * execution stack as well as the alternate signal stack are writeable. + * The application can use any of the available pkeys to protect the + * alternate signal stack, and we don't know which one it is, so enable + * all. The PKRU register will be reset to init_pkru later in the flow, + * in fpu__clear_user_states(), and it is the application's responsibility + * to enable the appropriate pkey as the first step in the signal handler + * so that the handler does not segfault. + */ +static inline u32 sig_prepare_pkru(void) +{ + u32 orig_pkru = read_pkru(); + + write_pkru(0); + return orig_pkru; +} + /* * Set up a signal frame. */ From 70044df250d022572e26cd301bddf75eac1fe50e Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Fri, 2 Aug 2024 06:13:16 +0000 Subject: [PATCH 026/126] x86/pkeys: Update PKRU to enable all pkeys before XSAVE If the alternate signal stack is protected by a different PKEY than the current execution stack, copying XSAVE data to the sigaltstack will fail if its PKEY is not enabled in the PKRU register. It's unknown which pkey was used by the application for the altstack, so enable all PKEYS before XSAVE. But this updated PKRU value is also pushed onto the sigframe, which means the register value restored from sigcontext will be different from the user-defined one, which is incorrect. Fix that by overwriting the PKRU value on the sigframe with the original, user-defined PKRU. Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-4-aruna.ramakrishna@oracle.com --- arch/x86/kernel/fpu/signal.c | 11 +++++++++-- arch/x86/kernel/signal.c | 12 ++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 931c5469d7f3..1065ab995305 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -168,8 +168,15 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { - if (use_xsave()) - return xsave_to_user_sigframe(buf); + int err = 0; + + if (use_xsave()) { + err = xsave_to_user_sigframe(buf); + if (!err) + err = update_pkru_in_sigframe(buf, pkru); + return err; + } + if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9dc77ad03a0e..5f441039b572 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -102,7 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - u32 pkru = read_pkru(); + u32 pkru; /* redzone */ if (!ia32_frame) @@ -157,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; } + /* Update PKRU to enable access to the alternate signal stack. */ + pkru = sig_prepare_pkru(); /* save i387 and extended state */ - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { + /* + * Restore PKRU to the original, user-defined value; disable + * extra pkeys enabled for the alternate signal stack, if any. + */ + write_pkru(pkru); return (void __user *)-1L; + } return (void __user *)sp; } From d10b554919d4cc8fa8fe2e95b57ad2624728c8e4 Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Fri, 2 Aug 2024 06:13:17 +0000 Subject: [PATCH 027/126] x86/pkeys: Restore altstack access in sigreturn() A process can disable access to the alternate signal stack by not enabling the altstack's PKEY in the PKRU register. Nevertheless, the kernel updates the PKRU temporarily for signal handling. However, in sigreturn(), restore_sigcontext() will restore the PKRU to the user-defined PKRU value. This will cause restore_altstack() to fail with a SIGSEGV as it needs read access to the altstack which is prohibited by the user-defined PKRU value. Fix this by restoring altstack before restoring PKRU. Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-5-aruna.ramakrishna@oracle.com --- arch/x86/kernel/signal_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 8a94053c5444..ee9453891901 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -260,15 +260,15 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_signal_shadow_stack()) goto badframe; - if (restore_altstack(&frame->uc.uc_stack)) - goto badframe; - return regs->ax; badframe: From 6998a73efbb8a87f4dd0bddde90b7f5b0d47b5e0 Mon Sep 17 00:00:00 2001 From: Keith Lucas Date: Fri, 2 Aug 2024 06:13:18 +0000 Subject: [PATCH 028/126] selftests/mm: Add new testcases for pkeys Add a few new tests to exercise the signal handler flow, especially with PKEY 0 disabled: - Verify that the SIGSEGV handler is invoked when pkey 0 is disabled. - Verify that a thread which disables PKEY 0 segfaults with PKUERR when accessing the stack. - Verify that the SIGSEGV handler that uses an alternate signal stack is correctly invoked when the thread disabled PKEY 0 - Verify that the PKRU value set by the application is correctly restored upon return from signal handling. - Verify that sigreturn() is able to restore the altstack even if the thread had PKEY 0 disabled [ Aruna: Adapted to upstream ] [ tglx: Made it actually compile. Restored protection_keys compile. Added useful info to the changelog instead of bare function names. ] Signed-off-by: Keith Lucas Signed-off-by: Aruna Ramakrishna Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802061318.2140081-6-aruna.ramakrishna@oracle.com --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/pkey-helpers.h | 13 +- .../selftests/mm/pkey_sighandler_tests.c | 481 ++++++++++++++++++ tools/testing/selftests/mm/protection_keys.c | 10 - 4 files changed, 494 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/mm/pkey_sighandler_tests.c diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 901e0d07765b..1f176fff7054 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -88,6 +88,7 @@ CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_pr CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) VMTARGETS := protection_keys +VMTARGETS += pkey_sighandler_tests BINARIES_32 := $(VMTARGETS:%=%_32) BINARIES_64 := $(VMTARGETS:%=%_64) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 1af3156a9db8..4d31a309a46b 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -79,7 +79,18 @@ extern void abort_hooks(void); } \ } while (0) -__attribute__((noinline)) int read_ptr(int *ptr); +#define barrier() __asm__ __volatile__("": : :"memory") +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +noinline int read_ptr(int *ptr) +{ + /* Keep GCC from optimizing this away somehow */ + barrier(); + return *ptr; +} + void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c new file mode 100644 index 000000000000..a8088b645ad6 --- /dev/null +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * The testcases in this file exercise various flows related to signal handling, + * using an alternate signal stack, with the default pkey (pkey 0) disabled. + * + * Compile with: + * gcc -mxsave -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + * gcc -mxsave -m32 -o pkey_sighandler_tests -O2 -g -std=gnu99 -pthread -Wall pkey_sighandler_tests.c -I../../../../tools/include -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pkey-helpers.h" + +#define STACK_SIZE PTHREAD_STACK_MIN + +void expected_pkey_fault(int pkey) {} + +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +siginfo_t siginfo = {0}; + +/* + * We need to use inline assembly instead of glibc's syscall because glibc's + * syscall will attempt to access the PLT in order to call a library function + * which is protected by MPK 0 which we don't have access to. + */ +static inline __always_inline +long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) +{ + unsigned long ret; +#ifdef __x86_64__ + register long r10 asm("r10") = a4; + register long r8 asm("r8") = a5; + register long r9 asm("r9") = a6; + asm volatile ("syscall" + : "=a"(ret) + : "a"(n), "D"(a1), "S"(a2), "d"(a3), "r"(r10), "r"(r8), "r"(r9) + : "rcx", "r11", "memory"); +#elif defined __i386__ + asm volatile ("int $0x80" + : "=a"(ret) + : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) + : "memory"); +#else +# error syscall_raw() not implemented +#endif + return ret; +} + +static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); +} + +static void sigusr1_handler(int signo, siginfo_t *info, void *ucontext) +{ + pthread_mutex_lock(&mutex); + + memcpy(&siginfo, info, sizeof(siginfo_t)); + + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); +} + +static void sigusr2_handler(int signo, siginfo_t *info, void *ucontext) +{ + /* + * pkru should be the init_pkru value which enabled MPK 0 so + * we can use library functions. + */ + printf("%s invoked.\n", __func__); +} + +static void raise_sigusr2(void) +{ + pid_t tid = 0; + + tid = syscall_raw(SYS_gettid, 0, 0, 0, 0, 0, 0); + + syscall_raw(SYS_tkill, tid, SIGUSR2, 0, 0, 0, 0); + + /* + * We should return from the signal handler here and be able to + * return to the interrupted thread. + */ +} + +static void *thread_segv_with_pkey0_disabled(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* Segfault (with SEGV_MAPERR) */ + *(int *) (0x1) = 1; + return NULL; +} + +static void *thread_segv_pkuerr_stack(void *ptr) +{ + /* Disable MPK 0 (and all others too) */ + __write_pkey_reg(0x55555555); + + /* After we disable MPK 0, we can't access the stack to return */ + return NULL; +} + +static void *thread_segv_maperr_ptr(void *ptr) +{ + stack_t *stack = ptr; + int *bad = (int *)1; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 1 is enabled. */ + __write_pkey_reg(0x55555551); + + /* Segfault */ + *bad = 1; + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0. + */ +static void test_sigsegv_handler_with_pkey0_disabled(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_with_pkey0_disabled, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler is invoked when pkey 0 is disabled. + * Note that the new thread stack and the alternate signal stack is + * protected by MPK 0, which renders them inaccessible when MPK 0 + * is disabled. So just the return from the thread should cause a + * segfault with SEGV_PKUERR. + */ +static void test_sigsegv_handler_cannot_access_stack(void) +{ + struct sigaction sa; + pthread_attr_t attr; + pthread_t thr; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigsegv_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + + pthread_create(&thr, &attr, thread_segv_pkuerr_stack, NULL); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_PKUERR, + "%s\n", __func__); +} + +/* + * Verify that the sigsegv handler that uses an alternate signal stack + * is correctly invoked for a thread which uses a non-zero MPK to protect + * its own stack, and disables all other MPKs (including 0). + */ +static void test_sigsegv_handler_with_different_pkey_for_stack(void) +{ + struct sigaction sa; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + + sa.sa_sigaction = sigsegv_handler; + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* Allow access to MPK 0 and MPK 1 */ + __write_pkey_reg(0x55555550); + + /* Protect the new stack with MPK 1 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + memset(&siginfo, 0, sizeof(siginfo)); + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_segv_maperr_ptr(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + ksft_test_result(siginfo.si_signo == SIGSEGV && + siginfo.si_code == SEGV_MAPERR && + siginfo.si_addr == (void *)1, + "%s\n", __func__); +} + +/* + * Verify that the PKRU value set by the application is correctly + * restored upon return from signal handling. + */ +static void test_pkru_preserved_after_sigusr1(void) +{ + struct sigaction sa; + unsigned long pkru = 0x45454544; + + sa.sa_flags = SA_SIGINFO; + + sa.sa_sigaction = sigusr1_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&siginfo, 0, sizeof(siginfo)); + + __write_pkey_reg(pkru); + + raise(SIGUSR1); + + pthread_mutex_lock(&mutex); + while (siginfo.si_signo == 0) + pthread_cond_wait(&cond, &mutex); + pthread_mutex_unlock(&mutex); + + /* Ensure the pkru value is the same after returning from signal. */ + ksft_test_result(pkru == __read_pkey_reg() && + siginfo.si_signo == SIGUSR1, + "%s\n", __func__); +} + +static noinline void *thread_sigusr2_self(void *ptr) +{ + /* + * A const char array like "Resuming after SIGUSR2" won't be stored on + * the stack and the code could access it via an offset from the program + * counter. This makes sure it's on the function's stack frame. + */ + char str[] = {'R', 'e', 's', 'u', 'm', 'i', 'n', 'g', ' ', + 'a', 'f', 't', 'e', 'r', ' ', + 'S', 'I', 'G', 'U', 'S', 'R', '2', + '.', '.', '.', '\n', '\0'}; + stack_t *stack = ptr; + + /* + * Setup alternate signal stack, which should be pkey_mprotect()ed by + * MPK 0. The thread's stack cannot be used for signals because it is + * not accessible by the default init_pkru value of 0x55555554. + */ + syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); + + /* Disable MPK 0. Only MPK 2 is enabled. */ + __write_pkey_reg(0x55555545); + + raise_sigusr2(); + + /* Do something, to show the thread resumed execution after the signal */ + syscall_raw(SYS_write, 1, (long) str, sizeof(str) - 1, 0, 0, 0); + + /* + * We can't return to test_pkru_sigreturn because it + * will attempt to use a %rbp value which is on the stack + * of the main thread. + */ + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + return NULL; +} + +/* + * Verify that sigreturn is able to restore altstack even if the thread had + * disabled pkey 0. + */ +static void test_pkru_sigreturn(void) +{ + struct sigaction sa = {0}; + static stack_t sigstack; + void *stack; + int pkey; + int parent_pid = 0; + int child_pid = 0; + + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); + + /* + * For this testcase, we do not want to handle SIGSEGV. Reset handler + * to default so that the application can crash if it receives SIGSEGV. + */ + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + sa.sa_flags = SA_SIGINFO | SA_ONSTACK; + sa.sa_sigaction = sigusr2_handler; + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + stack = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + assert(stack != MAP_FAILED); + + /* + * Allow access to MPK 0 and MPK 2. The child thread (to be created + * later in this flow) will have its stack protected by MPK 2, whereas + * the current thread's stack is protected by the default MPK 0. Hence + * both need to be enabled. + */ + __write_pkey_reg(0x55555544); + + /* Protect the stack with MPK 2 */ + pkey = pkey_alloc(0, 0); + pkey_mprotect(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); + + /* Set up alternate signal stack that will use the default MPK */ + sigstack.ss_sp = mmap(0, STACK_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + sigstack.ss_flags = 0; + sigstack.ss_size = STACK_SIZE; + + /* Use clone to avoid newer glibcs using rseq on new threads */ + long ret = syscall_raw(SYS_clone, + CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + (long) ((char *)(stack) + STACK_SIZE), + (long) &parent_pid, + (long) &child_pid, 0, 0); + + if (ret < 0) { + errno = -ret; + perror("clone"); + } else if (ret == 0) { + thread_sigusr2_self(&sigstack); + syscall_raw(SYS_exit, 0, 0, 0, 0, 0, 0); + } + + child_pid = ret; + /* Check that thread exited */ + do { + sched_yield(); + ret = syscall_raw(SYS_tkill, child_pid, 0, 0, 0, 0, 0); + } while (ret != -ESRCH && ret != -EINVAL); + + ksft_test_result_pass("%s\n", __func__); +} + +static void (*pkey_tests[])(void) = { + test_sigsegv_handler_with_pkey0_disabled, + test_sigsegv_handler_cannot_access_stack, + test_sigsegv_handler_with_different_pkey_for_stack, + test_pkru_preserved_after_sigusr1, + test_pkru_sigreturn +}; + +int main(int argc, char *argv[]) +{ + int i; + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(pkey_tests)); + + for (i = 0; i < ARRAY_SIZE(pkey_tests); i++) + (*pkey_tests[i])(); + + ksft_finished(); + return 0; +} diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index eaa6d1fc5328..cc6de1644360 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -950,16 +950,6 @@ void close_test_fds(void) nr_test_fds = 0; } -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) { int i, err; From 70e6b7d9ae3c63df90a7bba7700e8d5c300c3c60 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 2 Aug 2024 14:55:54 +0100 Subject: [PATCH 029/126] x86/i8253: Disable PIT timer 0 when not in use Leaving the PIT interrupt running can cause noticeable steal time for virtual guests. The VMM generally has a timer which toggles the IRQ input to the PIC and I/O APIC, which takes CPU time away from the guest. Even on real hardware, running the counter may use power needlessly (albeit not much). Make sure it's turned off if it isn't going to be used. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Link: https://lore.kernel.org/all/20240802135555.564941-1-dwmw2@infradead.org --- arch/x86/kernel/i8253.c | 11 +++++++++-- drivers/clocksource/i8253.c | 13 +++++++++---- include/linux/i8253.h | 1 + 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2b7999a1a50a..80e262bb627f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -39,9 +40,15 @@ static bool __init use_pit(void) bool __init pit_timer_init(void) { - if (!use_pit()) + if (!use_pit()) { + /* + * Don't just ignore the PIT. Ensure it's stopped, because + * VMMs otherwise steal CPU time just to pointlessly waggle + * the (masked) IRQ. + */ + clockevent_i8253_disable(); return false; - + } clockevent_i8253_init(true); global_clock_event = &i8253_clockevent; return true; diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index d4350bb10b83..cb215e6f2e83 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -108,11 +108,8 @@ int __init clocksource_i8253_init(void) #endif #ifdef CONFIG_CLKEVT_I8253 -static int pit_shutdown(struct clock_event_device *evt) +void clockevent_i8253_disable(void) { - if (!clockevent_state_oneshot(evt) && !clockevent_state_periodic(evt)) - return 0; - raw_spin_lock(&i8253_lock); outb_p(0x30, PIT_MODE); @@ -123,6 +120,14 @@ static int pit_shutdown(struct clock_event_device *evt) } raw_spin_unlock(&i8253_lock); +} + +static int pit_shutdown(struct clock_event_device *evt) +{ + if (!clockevent_state_oneshot(evt) && !clockevent_state_periodic(evt)) + return 0; + + clockevent_i8253_disable(); return 0; } diff --git a/include/linux/i8253.h b/include/linux/i8253.h index 8336b2f6f834..bf169cfef7f1 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -24,6 +24,7 @@ extern raw_spinlock_t i8253_lock; extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); +extern void clockevent_i8253_disable(void); extern void setup_pit_timer(void); From 531b2ca0a940ac9db03f246c8b77c4201de72b00 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 2 Aug 2024 14:55:55 +0100 Subject: [PATCH 030/126] clockevents/drivers/i8253: Fix stop sequence for timer 0 According to the data sheet, writing the MODE register should stop the counter (and thus the interrupts). This appears to work on real hardware, at least modern Intel and AMD systems. It should also work on Hyper-V. However, on some buggy virtual machines the mode change doesn't have any effect until the counter is subsequently loaded (or perhaps when the IRQ next fires). So, set MODE 0 and then load the counter, to ensure that those buggy VMs do the right thing and the interrupts stop. And then write MODE 0 *again* to stop the counter on compliant implementations too. Apparently, Hyper-V keeps firing the IRQ *repeatedly* even in mode zero when it should only happen once, but the second MODE write stops that too. Userspace test program (mostly written by tglx): ===== #include #include #include #include #include static __always_inline void __out##bwl(type value, uint16_t port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ static __always_inline type __in##bwl(uint16_t port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } BUILDIO(b, b, uint8_t) #define inb __inb #define outb __outb #define PIT_MODE 0x43 #define PIT_CH0 0x40 #define PIT_CH2 0x42 static int is8254; static void dump_pit(void) { if (is8254) { // Latch and output counter and status outb(0xC2, PIT_MODE); printf("%02x %02x %02x\n", inb(PIT_CH0), inb(PIT_CH0), inb(PIT_CH0)); } else { // Latch and output counter outb(0x0, PIT_MODE); printf("%02x %02x\n", inb(PIT_CH0), inb(PIT_CH0)); } } int main(int argc, char* argv[]) { int nr_counts = 2; if (argc > 1) nr_counts = atoi(argv[1]); if (argc > 2) is8254 = 1; if (ioperm(0x40, 4, 1) != 0) return 1; dump_pit(); printf("Set oneshot\n"); outb(0x38, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); printf("Set periodic\n"); outb(0x34, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set stop (%d counter writes)\n", nr_counts); outb(0x30, PIT_MODE); while (nr_counts--) outb(0xFF, PIT_CH0); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set MODE 0\n"); outb(0x30, PIT_MODE); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); return 0; } ===== Suggested-by: Sean Christopherson Co-developed-by: Li RongQing Signed-off-by: Li RongQing Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Link: https://lore.kernel.org/all/20240802135555.564941-2-dwmw2@infradead.org --- arch/x86/kernel/cpu/mshyperv.c | 11 ----------- drivers/clocksource/i8253.c | 36 +++++++++++++++++++++++----------- include/linux/i8253.h | 1 - 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e0fd57a8ba84..3d4237f27569 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -522,16 +521,6 @@ static void __init ms_hyperv_init_platform(void) if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; - /* - * Hyper-V VMs have a PIT emulation quirk such that zeroing the - * counter register during PIT shutdown restarts the PIT. So it - * continues to interrupt @18.2 HZ. Setting i8253_clear_counter - * to false tells pit_shutdown() not to zero the counter so that - * the PIT really is shutdown. Generation 2 VMs don't have a PIT, - * and setting this value has no effect. - */ - i8253_clear_counter_on_shutdown = false; - #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || ms_hyperv.paravisor_present) diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index cb215e6f2e83..39f7c2d736d1 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -20,13 +20,6 @@ DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -/* - * Handle PIT quirk in pit_shutdown() where zeroing the counter register - * restarts the PIT, negating the shutdown. On platforms with the quirk, - * platform specific code can set this to false. - */ -bool i8253_clear_counter_on_shutdown __ro_after_init = true; - #ifdef CONFIG_CLKSRC_I8253 /* * Since the PIT overflows every tick, its not very useful @@ -112,12 +105,33 @@ void clockevent_i8253_disable(void) { raw_spin_lock(&i8253_lock); + /* + * Writing the MODE register should stop the counter, according to + * the datasheet. This appears to work on real hardware (well, on + * modern Intel and AMD boxes; I didn't dig the Pegasos out of the + * shed). + * + * However, some virtual implementations differ, and the MODE change + * doesn't have any effect until either the counter is written (KVM + * in-kernel PIT) or the next interrupt (QEMU). And in those cases, + * it may not stop the *count*, only the interrupts. Although in + * the virt case, that probably doesn't matter, as the value of the + * counter will only be calculated on demand if the guest reads it; + * it's the interrupts which cause steal time. + * + * Hyper-V apparently has a bug where even in mode 0, the IRQ keeps + * firing repeatedly if the counter is running. But it *does* do the + * right thing when the MODE register is written. + * + * So: write the MODE and then load the counter, which ensures that + * the IRQ is stopped on those buggy virt implementations. And then + * write the MODE again, which is the right way to stop it. + */ outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); - if (i8253_clear_counter_on_shutdown) { - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); - } + outb_p(0x30, PIT_MODE); raw_spin_unlock(&i8253_lock); } diff --git a/include/linux/i8253.h b/include/linux/i8253.h index bf169cfef7f1..56c280eb2d4f 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -21,7 +21,6 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) extern raw_spinlock_t i8253_lock; -extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); extern void clockevent_i8253_disable(void); From e7ff4ebffe3bedf55560ef861d80f6500ff0d76f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Aug 2024 08:46:18 -0700 Subject: [PATCH 031/126] x86/tsc: Check for sockets instead of CPUs to make code match comment The unsynchronized_tsc() eventually checks num_possible_cpus(), and if the system is non-Intel and the number of possible CPUs is greater than one, assumes that TSCs are unsynchronized. This despite the comment saying "assume multi socket systems are not synchronized", that is, socket rather than CPU. This behavior was preserved by commit 8fbbc4b45ce3 ("x86: merge tsc_init and clocksource code") and by the previous relevant commit 7e69f2b1ead2 ("clocksource: Remove the update callback"). The clocksource drivers were added by commit 5d0cf410e94b ("Time: i386 Clocksource Drivers") back in 2006, and the comment still said "socket" rather than "CPU". Therefore, bravely (and perhaps foolishly) make the code match the comment. Note that it is possible to bypass both code and comment by booting with tsc=reliable, but this also disables the clocksource watchdog, which is undesirable when trust in the TSC is strictly limited. Reported-by: Zhengxu Chen Reported-by: Danielle Costantino Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240802154618.4149953-5-paulmck@kernel.org --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0ced1870ed40..dfe6847fd99e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1288,7 +1288,7 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (topology_max_packages() > 1) return 1; } From 5760929f6545c651682de3c2c6c6786816b17bb1 Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Wed, 17 Jul 2024 16:31:20 -0500 Subject: [PATCH 032/126] x86/kexec: Add EFI config table identity mapping for kexec kernel A kexec kernel boot failure is sometimes observed on AMD CPUs due to an unmapped EFI config table array. This can be seen when "nogbpages" is on the kernel command line, and has been observed as a full BIOS reboot rather than a successful kexec. This was also the cause of reported regressions attributed to Commit 7143c5f4cf20 ("x86/mm/ident_map: Use gbpages only where full GB page should be mapped.") which was subsequently reverted. To avoid this page fault, explicitly include the EFI config table array in the kexec identity map. Further explanation: The following 2 commits caused the EFI config table array to be accessed when enabling sev at kernel startup. commit ec1c66af3a30 ("x86/compressed/64: Detect/setup SEV/SME features earlier during boot") commit c01fce9cef84 ("x86/compressed: Add SEV-SNP feature detection/setup") This is in the code that examines whether SEV should be enabled or not, so it can even affect systems that are not SEV capable. This may result in a page fault if the EFI config table array's address is unmapped. Since the page fault occurs before the new kernel establishes its own identity map and page fault routines, it is unrecoverable and kexec fails. Most often, this problem is not seen because the EFI config table array gets included in the map by the luck of being placed at a memory address close enough to other memory areas that *are* included in the map created by kexec. Both the "nogbpages" command line option and the "use gpbages only where full GB page should be mapped" change greatly reduce the chance of being included in the map by luck, which is why the problem appears. Signed-off-by: Tao Liu Signed-off-by: Steve Wahl Signed-off-by: Thomas Gleixner Tested-by: Pavin Joseph Tested-by: Sarah Brofeldt Tested-by: Eric Hagberg Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/all/20240717213121.3064030-2-steve.wahl@hpe.com --- arch/x86/kernel/machine_kexec_64.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cc0f7f70b17b..9c9ac606893e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI /* @@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; + void *kaddr; + int ret; if (!efi_enabled(EFI_BOOT)) return 0; @@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) if (!mstart) return 0; + ret = kernel_ident_mapping_init(info, level4p, mstart, mend); + if (ret) + return ret; + + kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); + if (!kaddr) { + pr_err("Could not map UEFI system table\n"); + return -ENOMEM; + } + + mstart = efi_config_table; + + if (efi_enabled(EFI_64BIT)) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; + } else { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; + } + + memunmap(kaddr); + return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; From cc31744a294584a36bf764a0ffa3255a8e69f036 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Wed, 17 Jul 2024 16:31:21 -0500 Subject: [PATCH 033/126] x86/mm/ident_map: Use gbpages only where full GB page should be mapped. When ident_pud_init() uses only GB pages to create identity maps, large ranges of addresses not actually requested can be included in the resulting table; a 4K request will map a full GB. This can include a lot of extra address space past that requested, including areas marked reserved by the BIOS. That allows processor speculation into reserved regions, that on UV systems can cause system halts. Only use GB pages when map creation requests include the full GB page of space. Fall back to using smaller 2M pages when only portions of a GB page are included in the request. No attempt is made to coalesce mapping requests. If a request requires a map entry at the 2M (pmd) level, subsequent mapping requests within the same 1G region will also be at the pmd level, even if adjacent or overlapping such requests could have been combined to map a full GB page. Existing usage starts with larger regions and then adds smaller regions, so this should not have any great consequence. Signed-off-by: Steve Wahl Signed-off-by: Thomas Gleixner Tested-by: Pavin Joseph Tested-by: Sarah Brofeldt Tested-by: Eric Hagberg Link: https://lore.kernel.org/all/20240717213121.3064030-3-steve.wahl@hpe.com --- arch/x86/mm/ident_map.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index c45127265f2f..437e96fb4977 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -99,18 +99,31 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - if (info->direct_gbpages) { + /* if this is already a gbpage, this portion is already mapped */ + if (pud_leaf(*pud)) + continue; + + /* Is using a gbpage allowed? */ + use_gbpage = info->direct_gbpages; + + /* Don't use gbpage if it maps more than the requested region. */ + /* at the begining: */ + use_gbpage &= ((addr & ~PUD_MASK) == 0); + /* ... or at the end: */ + use_gbpage &= ((next & ~PUD_MASK) == 0); + + /* Never overwrite existing mappings */ + use_gbpage &= !pud_present(*pud); + + if (use_gbpage) { pud_t pudval; - if (pud_present(*pud)) - continue; - - addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; From 7424fc6b86c8980a87169e005f5cd4438d18efe6 Mon Sep 17 00:00:00 2001 From: Gatlin Newhouse Date: Wed, 24 Jul 2024 00:01:55 +0000 Subject: [PATCH 034/126] x86/traps: Enable UBSAN traps on x86 Currently ARM64 extracts which specific sanitizer has caused a trap via encoded data in the trap instruction. Clang on x86 currently encodes the same data in the UD1 instruction but x86 handle_bug() and is_valid_bugaddr() currently only look at UD2. Bring x86 to parity with ARM64, similar to commit 25b84002afb9 ("arm64: Support Clang UBSAN trap codes for better reporting"). See the llvm links for information about the code generation. Enable the reporting of UBSAN sanitizer details on x86 compiled with clang when CONFIG_UBSAN_TRAP=y by analysing UD1 and retrieving the type immediate which is encoded by the compiler after the UD1. [ tglx: Simplified it by moving the printk() into handle_bug() ] Signed-off-by: Gatlin Newhouse Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Kees Cook Link: https://lore.kernel.org/all/20240724000206.451425-1-gatlin.newhouse@gmail.com Link: https://github.com/llvm/llvm-project/commit/c5978f42ec8e9#diff-bb68d7cd885f41cfc35843998b0f9f534adb60b415f647109e597ce448e92d9f Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86InstrSystem.td#L27 --- arch/x86/include/asm/bug.h | 12 ++++++++ arch/x86/kernel/traps.c | 59 ++++++++++++++++++++++++++++++++++---- include/linux/ubsan.h | 5 ++++ lib/Kconfig.ubsan | 4 +-- 4 files changed, 73 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index a3ec87d198ac..806649c7f23d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -13,6 +13,18 @@ #define INSN_UD2 0x0b0f #define LEN_UD2 2 +/* + * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. + */ +#define INSN_ASOP 0x67 +#define OPCODE_ESCAPE 0x0f +#define SECOND_BYTE_OPCODE_UD1 0xb9 +#define SECOND_BYTE_OPCODE_UD2 0x0b + +#define BUG_NONE 0xffff +#define BUG_UD1 0xfffe +#define BUG_UD2 0xfffd + #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..415881607c5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, get the ModRM byte to pass along to UBSan. + */ +__always_inline int decode_bug(unsigned long addr, u32 *imm) +{ + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + if (v != OPCODE_ESCAPE) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) + return BUG_UD2; + + if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + /* Retrieve the immediate (type value) for the UBSAN UD1 */ + v = *(u8 *)(addr++); + if (X86_MODRM_RM(v) == 4) + addr++; + + *imm = 0; + if (X86_MODRM_MOD(v) == 1) + *imm = *(u8 *)addr; + else if (X86_MODRM_MOD(v) == 2) + *imm = *(u32 *)addr; + else + WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -216,6 +258,8 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; + int ud_type; + u32 imm; /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() @@ -223,7 +267,8 @@ static noinstr bool handle_bug(struct pt_regs *regs) * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(regs->ip, &imm); + if (ud_type == BUG_NONE) return handled; /* @@ -236,10 +281,14 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + if (ud_type == BUG_UD2) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h index bff7445498de..d8219cbe09ff 100644 --- a/include/linux/ubsan.h +++ b/include/linux/ubsan.h @@ -4,6 +4,11 @@ #ifdef CONFIG_UBSAN_TRAP const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type); +#else +static inline const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) +{ + return NULL; +} #endif #endif diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bdda600f8dfb..1d4aa7a83b3a 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -29,8 +29,8 @@ config UBSAN_TRAP Also note that selecting Y will cause your kernel to Oops with an "illegal instruction" error with no further details - when a UBSAN violation occurs. (Except on arm64, which will - report which Sanitizer failed.) This may make it hard to + when a UBSAN violation occurs. (Except on arm64 and x86, which + will report which Sanitizer failed.) This may make it hard to determine whether an Oops was caused by UBSAN or to figure out the details of a UBSAN violation. It makes the kernel log output less useful for bug reports. From 830802a0fea8fb39d3dc9fb7d6b5581e1343eb1f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:34 +0200 Subject: [PATCH 035/126] x86/ioapic: Handle allocation failures gracefully Breno observed panics when using failslab under certain conditions during runtime: can not alloc irq_pin_list (-1,0,20) Kernel panic - not syncing: IO-APIC: failed to add irq-pin. Can not proceed panic+0x4e9/0x590 mp_irqdomain_alloc+0x9ab/0xa80 irq_domain_alloc_irqs_locked+0x25d/0x8d0 __irq_domain_alloc_irqs+0x80/0x110 mp_map_pin_to_irq+0x645/0x890 acpi_register_gsi_ioapic+0xe6/0x150 hpet_open+0x313/0x480 That's a pointless panic which is a leftover of the historic IO/APIC code which panic'ed during early boot when the interrupt allocation failed. The only place which might justify panic is the PIT/HPET timer_check() code which tries to figure out whether the timer interrupt is delivered through the IO/APIC. But that code does not require to handle interrupt allocation failures. If the interrupt cannot be allocated then timer delivery fails and it either panics due to that or falls back to legacy mode. Cure this by removing the panic wrapper around __add_pin_to_irq_node() and making mp_irqdomain_alloc() aware of the failure condition and handle it as any other failure in this function gracefully. Reported-by: Breno Leitao Signed-off-by: Thomas Gleixner Tested-by: Breno Leitao Tested-by: Qiuxu Zhuo Link: https://lore.kernel.org/all/ZqfJmUF8sXIyuSHN@gmail.com Link: https://lore.kernel.org/all/20240802155440.275200843@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 46 ++++++++++++++++------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 477b740b2f26..d1ec1dcb637a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -352,27 +352,26 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int __add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) +static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin) { struct irq_pin_list *entry; - /* don't allow duplicates */ - for_each_irq_pin(entry, data->irq_2_pin) + /* Don't allow duplicates */ + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return 0; + return true; + } entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { - pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); - return -ENOMEM; + pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin); + return false; } + entry->apic = apic; entry->pin = pin; list_add_tail(&entry->list, &data->irq_2_pin); - - return 0; + return true; } static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) @@ -387,13 +386,6 @@ static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) } } -static void add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) -{ - if (__add_pin_to_irq_node(data, node, apic, pin)) - panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); -} - /* * Reroute an IRQ to a different pin. */ @@ -1002,8 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic.pin)) + if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -3017,10 +3008,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -ENOMEM; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(data); - return ret; - } + if (ret < 0) + goto free_data; INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic.pin; @@ -3029,7 +3018,10 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) { + ret = -ENOMEM; + goto free_irqs; + } mp_preconfigure_entry(data); mp_register_handler(virq, data->is_level); @@ -3044,6 +3036,12 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low); return 0; + +free_irqs: + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +free_data: + kfree(data); + return ret; } void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, From 6daceb891d5fd8729f9b4453b35d3d47dab10914 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:36 +0200 Subject: [PATCH 036/126] x86/ioapic: Mark mp_alloc_timer_irq() __init Only invoked from check_timer() which is __init too. Cleanup the variable declaration while at it. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Reviewed-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.339321108@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d1ec1dcb637a..30a3af263193 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2122,10 +2122,10 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); -static int mp_alloc_timer_irq(int ioapic, int pin) +static int __init mp_alloc_timer_irq(int ioapic, int pin) { - int irq = -1; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + int irq = -1; if (domain) { struct irq_alloc_info info; From d8c76d0167a0f20d071c540b1ffba5794eeb83ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:37 +0200 Subject: [PATCH 037/126] x86/ioapic: Cleanup structs Make them conforming to the TIP coding style guide. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.402005874@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 30a3af263193..e24d48d27c22 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,8 +86,8 @@ static unsigned int ioapic_dynirq_base; static int ioapic_initialized; struct irq_pin_list { - struct list_head list; - int apic, pin; + struct list_head list; + int apic, pin; }; struct mp_chip_data { @@ -96,7 +96,7 @@ struct mp_chip_data { bool is_level; bool active_low; bool isa_irq; - u32 count; + u32 count; }; struct mp_ioapic_gsi { @@ -105,21 +105,17 @@ struct mp_ioapic_gsi { }; static struct ioapic { - /* - * # of IRQ routing registers - */ - int nr_registers; - /* - * Saved state during suspend/resume, or while enabling intr-remap. - */ - struct IO_APIC_route_entry *saved_registers; + /* # of IRQ routing registers */ + int nr_registers; + /* Saved state during suspend/resume, or while enabling intr-remap. */ + struct IO_APIC_route_entry *saved_registers; /* I/O APIC config */ - struct mpc_ioapic mp_config; + struct mpc_ioapic mp_config; /* IO APIC gsi routing info */ - struct mp_ioapic_gsi gsi_config; - struct ioapic_domain_cfg irqdomain_cfg; - struct irq_domain *irqdomain; - struct resource *iomem_res; + struct mp_ioapic_gsi gsi_config; + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct resource *iomem_res; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -2431,8 +2427,8 @@ static void ioapic_resume(void) } static struct syscore_ops ioapic_syscore_ops = { - .suspend = save_ioapic_entries, - .resume = ioapic_resume, + .suspend = save_ioapic_entries, + .resume = ioapic_resume, }; static int __init ioapic_init_ops(void) From ed57538b8510a5811ec049659b726cdb24ed6b46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:38 +0200 Subject: [PATCH 038/126] x86/ioapic: Use guard() for locking where applicable KISS rules! Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.464227224@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 192 +++++++++++---------------------- 1 file changed, 64 insertions(+), 128 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e24d48d27c22..4715e2f887da 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -296,14 +296,8 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - struct IO_APIC_route_entry entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - entry = __ioapic_read_entry(apic, pin); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return entry; + guard(raw_spinlock_irqsave)(&ioapic_lock); + return __ioapic_read_entry(apic, pin); } /* @@ -320,11 +314,8 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __ioapic_write_entry(apic, pin, e); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -335,12 +326,10 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { struct IO_APIC_route_entry e = { .masked = true }; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_write(apic, 0x10 + 2*pin, e.w1); io_apic_write(apic, 0x11 + 2*pin, e.w2); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -433,11 +422,9 @@ static void io_apic_sync(struct irq_pin_list *entry) static void mask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_modify_irq(data, true, &io_apic_sync); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) @@ -448,11 +435,9 @@ static void __unmask_ioapic(struct mp_chip_data *data) static void unmask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __unmask_ioapic(data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -497,13 +482,11 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - unsigned long flags; struct irq_pin_list *entry; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -526,8 +509,6 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) } if (entry.irr) { - unsigned long flags; - /* * Make sure the trigger mode is set to level. Explicit EOI * doesn't clear the remote-IRR if the trigger mode is not @@ -537,9 +518,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.is_level = true; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __eoi_ioapic_pin(apic, pin, entry.vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -1033,7 +1013,7 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, return -EINVAL; } - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (!(flags & IOAPIC_MAP_ALLOC)) { if (!legacy) { irq = irq_find_mapping(domain, pin); @@ -1054,8 +1034,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, data->count++; } } - mutex_unlock(&ioapic_mutex); - return irq; } @@ -1120,10 +1098,9 @@ void mp_unmap_irq(int irq) if (!data || data->isa_irq) return; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (--data->count == 0) irq_domain_free_irqs(irq, 1); - mutex_unlock(&ioapic_mutex); } /* @@ -1251,16 +1228,15 @@ static void __init print_IO_APIC(int ioapic_idx) union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - reg_01.raw = io_apic_read(ioapic_idx, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(ioapic_idx, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(ioapic_idx, 3); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + } printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); @@ -1451,7 +1427,6 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; unsigned char old_id; - unsigned long flags; int ioapic_idx, i; /* @@ -1465,9 +1440,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) */ for_each_ioapic(ioapic_idx) { /* Read the register 0 value */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic_idx, 0); old_id = mpc_ioapic_id(ioapic_idx); @@ -1522,16 +1496,11 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) mpc_ioapic_id(ioapic_idx)); reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_idx, 0, reg_00.raw); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic_idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_idx, 0); + } + /* Sanity check */ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) pr_cont("could not set ID!\n"); else @@ -1661,17 +1630,14 @@ static int __init timer_irq_works(void) static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); if (irq < nr_legacy_irqs()) { legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; } __unmask_ioapic(data->chip_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return was_pending; } @@ -1681,9 +1647,8 @@ atomic_t irq_mis_count; static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) { struct IO_APIC_route_entry e; int pin; @@ -1691,13 +1656,9 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) pin = entry->pin; e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (e.irr) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (e.irr) return true; - } } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return false; } @@ -1898,18 +1859,16 @@ static void ioapic_configure_entry(struct irq_data *irqd) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } -static int ioapic_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) +static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); - raw_spin_lock_irqsave(&ioapic_lock, flags); + + guard(raw_spinlock_irqsave)(&ioapic_lock); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -1928,9 +1887,8 @@ static int ioapic_set_affinity(struct irq_data *irq_data, * * Verify that the corresponding Remote-IRR bits are clear. */ -static int ioapic_irq_get_chip_state(struct irq_data *irqd, - enum irqchip_irq_state which, - bool *state) +static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which, + bool *state) { struct mp_chip_data *mcd = irqd->chip_data; struct IO_APIC_route_entry rentry; @@ -1940,7 +1898,8 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, return -EINVAL; *state = false; - raw_spin_lock(&ioapic_lock); + + guard(raw_spinlock)(&ioapic_lock); for_each_irq_pin(p, mcd->irq_2_pin) { rentry = __ioapic_read_entry(p->apic, p->pin); /* @@ -1954,7 +1913,6 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, break; } } - raw_spin_unlock(&ioapic_lock); return 0; } @@ -2129,9 +2087,8 @@ static int __init mp_alloc_timer_irq(int ioapic, int pin) ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.devid = mpc_ioapic_id(ioapic); info.ioapic.pin = pin; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); - mutex_unlock(&ioapic_mutex); } return irq; @@ -2142,8 +2099,6 @@ static int __init mp_alloc_timer_irq(int ioapic, int pin) * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { @@ -2404,16 +2359,14 @@ void __init setup_IO_APIC(void) static void resume_ioapic_id(int ioapic_idx) { - unsigned long flags; union IO_APIC_reg_00 reg_00; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_00.raw = io_apic_read(ioapic_idx, 0); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); io_apic_write(ioapic_idx, 0, reg_00.raw); } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ioapic_resume(void) @@ -2443,15 +2396,13 @@ device_initcall(ioapic_init_ops); static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - /* The register returns the maximum index redir index - * supported, which is one less than the total number of redir - * entries. + /* + * The register returns the maximum index redir index supported, + * which is one less than the total number of redir entries. */ return reg_01.bits.entries + 1; } @@ -2481,16 +2432,14 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC); const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - unsigned long flags; int i = 0; /* Initialize the ID map */ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC)) copy_phys_cpu_present_map(apic_id_map); - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic, 0); if (apic_id >= broadcast_id) { pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n", @@ -2517,21 +2466,19 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + } /* Sanity check */ if (reg_00.bits.ID != apic_id) { - pr_err("IOAPIC[%d]: Unable to change apic_id!\n", - ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); return -1; } } - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); return apic_id; } @@ -2547,7 +2494,6 @@ static u8 io_apic_unique_id(int idx, u8 id) { union IO_APIC_reg_00 reg_00; DECLARE_BITMAP(used, 256); - unsigned long flags; u8 new_id; int i; @@ -2563,26 +2509,23 @@ static u8 io_apic_unique_id(int idx, u8 id) * Read the current id from the ioapic and keep it if * available. */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(idx, 0); + new_id = reg_00.bits.ID; if (!test_bit(new_id, used)) { - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", - idx, new_id, id); + apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); return new_id; } - /* - * Get the next free id and write it to the ioapic. - */ + /* Get the next free id and write it to the ioapic. */ new_id = find_first_zero_bit(used, 256); reg_00.bits.ID = new_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(idx, 0, reg_00.raw); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + } /* Sanity check */ BUG_ON(reg_00.bits.ID != new_id); @@ -2592,12 +2535,10 @@ static u8 io_apic_unique_id(int idx, u8 id) static int io_apic_get_version(int ioapic) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; + union IO_APIC_reg_01 reg_01; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } @@ -3050,22 +2991,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { data = irq_data->chip_data; - __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool reserve) +int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } From d768e3f3e3fb43df2559d4c053b0f68c9649b2c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:39 +0200 Subject: [PATCH 039/126] x86/apic: Provide apic_printk() helpers apic_printk() requires the APIC verbosity level and printk level which is tedious and horrible to read. Provide helpers to simplify all of that. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.527510045@linutronix.de --- arch/x86/include/asm/apic.h | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9327eb00e96d..34eaebe2f61d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -18,6 +18,11 @@ #define ARCH_APICTIMER_STOPS_ON_C3 1 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Debugging macros */ @@ -25,22 +30,22 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 -/* Macros for apic_extnmi which controls external NMI masking */ -#define APIC_EXTNMI_BSP 0 /* Default */ -#define APIC_EXTNMI_ALL 1 -#define APIC_EXTNMI_NONE 2 - /* - * Define the default level of output to be very little - * This can be turned up by using apic=verbose for more - * information and apic=debug for _lots_ of information. - * apic_verbosity is defined in apic.c + * Define the default level of output to be very little This can be turned + * up by using apic=verbose for more information and apic=debug for _lots_ + * of information. apic_verbosity is defined in apic.c */ -#define apic_printk(v, s, a...) do { \ - if ((v) <= apic_verbosity) \ - printk(s, ##a); \ - } while (0) +#define apic_printk(v, s, a...) \ +do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ +} while (0) +#define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a) +#define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a) +#define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a) +/* Unconditional debug prints for code which is guarded by apic_verbosity already */ +#define apic_dbg(s, a...) printk(KERN_DEBUG s, ##a) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) extern void x86_32_probe_apic(void); From ac1c9fc1b571d5355602daa642f74288ae4b701d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:41 +0200 Subject: [PATCH 040/126] x86/apic: Cleanup apic_printk()s Use the new apic_pr_*() helpers and cleanup the apic_printk() maze. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.589821068@linutronix.de --- arch/x86/kernel/apic/apic.c | 81 ++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 66fd4b2a37a3..1838a73b0950 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -677,7 +677,7 @@ calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) return -1; #endif - apic_printk(APIC_VERBOSE, "... PM-Timer delta = %u\n", deltapm); + apic_pr_verbose("... PM-Timer delta = %u\n", deltapm); /* Check, if the PM timer is available */ if (!deltapm) @@ -687,14 +687,14 @@ calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) if (deltapm > (pm_100ms - pm_thresh) && deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + apic_pr_verbose("... PM-Timer result ok\n"); return 0; } res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warn("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n", (long)res); + pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n", + (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -707,9 +707,8 @@ calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); - apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld)\n", - (unsigned long)res, *deltatsc); + apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -792,8 +791,7 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ if (!lapic_init_clockevent()) { - apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_period); + apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period); /* * Direct calibration methods must have an always running * local APIC timer, no need for broadcast timer. @@ -802,8 +800,7 @@ static int __init calibrate_APIC_clock(void) return 0; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); + apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n"); /* * There are platforms w/o global clockevent devices. Instead of @@ -866,7 +863,7 @@ static int __init calibrate_APIC_clock(void) /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + apic_pr_verbose("... lapic delta = %ld\n", delta); deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); @@ -877,22 +874,19 @@ static int __init calibrate_APIC_clock(void) lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; lapic_init_clockevent(); - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - lapic_timer_period); + apic_pr_verbose("..... delta %ld\n", delta); + apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult); + apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); } - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - lapic_timer_period / (1000000 / HZ), - lapic_timer_period % (1000000 / HZ)); + apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n", + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result @@ -911,7 +905,7 @@ static int __init calibrate_APIC_clock(void) * available. */ if (!pm_referenced && global_clock_event) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + apic_pr_verbose("... verify APIC timer\n"); /* * Setup the apic timer manually @@ -932,11 +926,11 @@ static int __init calibrate_APIC_clock(void) /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + apic_pr_verbose("... jiffies delta = %lu\n", deltaj); /* Check, if the jiffies result is consistent */ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + apic_pr_verbose("... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; } @@ -1221,9 +1215,8 @@ void __init sync_Arb_IDs(void) */ apic_wait_icr_idle(); - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_pr_debug("Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } enum apic_intr_mode_id apic_intr_mode __ro_after_init; @@ -1409,10 +1402,10 @@ static void lapic_setup_esr(void) if (maxlvt > 3) apic_write(APIC_ESR, 0); value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08x after: 0x%08x\n", - oldvalue, value); + if (value != oldvalue) { + apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n", + oldvalue, value); + } } #define APIC_IR_REGS APIC_ISR_NR @@ -1599,10 +1592,10 @@ static void setup_local_APIC(void) value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu); } apic_write(APIC_LVT0, value); @@ -2066,8 +2059,7 @@ static __init void apic_set_fixmap(bool read_apic) { set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - apic_mmio_base, mp_lapic_addr); + apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr); if (read_apic) apic_read_boot_cpu_id(false); } @@ -2170,18 +2162,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) apic_eoi(); atomic_inc(&irq_err_count); - apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", - smp_processor_id(), v); + apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v); v &= 0xff; while (v) { if (v & 0x1) - apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + apic_pr_debug_cont(" : %s", error_interrupt_reason[i]); i++; v >>= 1; } - apic_printk(APIC_DEBUG, KERN_CONT "\n"); + apic_pr_debug_cont("\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); } @@ -2201,8 +2192,7 @@ static void __init connect_bsp_APIC(void) * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's * local APIC to INT and NMI lines. */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); + apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n"); imcr_pic_to_apic(); } #endif @@ -2227,8 +2217,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) * IPIs, won't work beyond this point! The only exception are * INIT IPIs. */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); + apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n"); imcr_apic_to_pic(); return; } From f47998da395c1dfa53449bd335db4cf508be5275 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:42 +0200 Subject: [PATCH 041/126] x86/ioapic: Cleanup apic_printk()s Replace apic_printk($LEVEL) with the corresponding apic_pr_*() helpers and use pr_info() for APIC_QUIET as that is always printed so the indirection is pointless noise. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.652239904@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 176 ++++++++++++++------------------- 1 file changed, 73 insertions(+), 103 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4715e2f887da..b978326baa86 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -201,10 +201,9 @@ void mp_save_irq(struct mpc_intsrc *m) { int i; - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, - m->srcbusirq, m->dstapic, m->dstirq); + apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); for (i = 0; i < mp_irq_entries; i++) { if (!memcmp(&mp_irqs[i], m, sizeof(*m))) @@ -554,28 +553,23 @@ static int pirq_entries[MAX_PIRQS] = { static int __init ioapic_pirq_setup(char *str) { - int i, max; - int ints[MAX_PIRQS+1]; + int i, max, ints[MAX_PIRQS+1]; get_options(str, ARRAY_SIZE(ints), ints); - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); + apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; if (ints[0] < MAX_PIRQS) max = ints[0]; for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ + apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]); + /* PIRQs are mapped upside down, usually */ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; } return 1; } - __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ @@ -737,8 +731,7 @@ static bool EISA_ELCR(unsigned int irq) unsigned int port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); + apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq); return false; } @@ -1052,15 +1045,13 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) * PCI IRQ command line redirection. Yes, limits are hardcoded. */ if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); + if (pirq_entries[pin - 16] != -1) { + if (!pirq_entries[pin - 16]) { + apic_pr_verbose("Disabling PIRQ%d\n", pin - 16); } else { int irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); + + apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq); return irq; } } @@ -1111,12 +1102,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int irq, i, best_ioapic = -1, best_idx = -1; - apic_printk(APIC_DEBUG, - "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); + apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, - "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } @@ -1173,17 +1162,16 @@ static void __init setup_IO_APIC_irqs(void) unsigned int ioapic, pin; int idx; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_pr_verbose("Init IO_APIC IRQs\n"); for_each_ioapic_pin(ioapic, pin) { idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - apic_printk(APIC_VERBOSE, - KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(ioapic), pin); - else - pin_2_irq(idx, ioapic, pin, - ioapic ? 0 : IOAPIC_MAP_ALLOC); + if (idx < 0) { + apic_pr_verbose("apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + } else { + pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC); + } } } @@ -1359,29 +1347,24 @@ void __init enable_IO_APIC(void) i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + pr_warn("ExtINT not setup in hardware but reported by MP table\n"); ioapic_i8259.pin = i8259_pin; ioapic_i8259.apic = i8259_apic; } /* Complain if the MP table and the hardware disagree */ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + pr_warn("ExtINT in hardware and MP table differ\n"); - /* - * Do not trust the IO-APIC being empty at bootup - */ + /* Do not trust the IO-APIC being empty at bootup */ clear_IO_APIC(); } void native_restore_boot_irq_mode(void) { /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. + * If the i8259 is routed through an IOAPIC Put that IOAPIC in + * virtual wire mode so legacy interrupts can be delivered. */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; @@ -1469,8 +1452,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) set_bit(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); + apic_pr_verbose("Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } @@ -1491,9 +1474,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) continue; - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mpc_ioapic_id(ioapic_idx)); + apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { @@ -1504,7 +1486,7 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) pr_cont("could not set ID!\n"); else - apic_printk(APIC_VERBOSE, " ok.\n"); + apic_pr_verbose(" ok.\n"); } } @@ -2136,9 +2118,8 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2182,13 +2163,10 @@ static inline void __init check_timer(void) panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); + pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n"); + pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2197,7 +2175,7 @@ static inline void __init check_timer(void) irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + pr_info("....... works.\n"); goto out; } /* @@ -2205,26 +2183,24 @@ static inline void __init check_timer(void) */ legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + pr_info("....... failed.\n"); } - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); + pr_info("...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + pr_info("..... failed.\n"); - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); + pr_info("...trying to set up timer as ExtINT IRQ...\n"); legacy_pic->init(0); legacy_pic->make_irq(0); @@ -2234,14 +2210,15 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - if (apic_is_x2apic_enabled()) - apic_printk(APIC_QUIET, KERN_INFO - "Perhaps problem with the pre-enabled x2apic mode\n" - "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + + pr_info("..... failed :\n"); + if (apic_is_x2apic_enabled()) { + pr_info("Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + } panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: @@ -2339,7 +2316,7 @@ void __init setup_IO_APIC(void) io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + apic_pr_verbose("ENABLING IO-APIC IRQs\n"); for_each_ioapic(ioapic) BUG_ON(mp_irqdomain_create(ioapic)); @@ -2478,7 +2455,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) } } - apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); return apic_id; } @@ -2514,8 +2491,8 @@ static u8 io_apic_unique_id(int idx, u8 id) new_id = reg_00.bits.ID; if (!test_bit(new_id, used)) { - apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", - idx, new_id, id); + apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); return new_id; } @@ -2614,9 +2591,7 @@ void __init io_apic_init_mappings(void) ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " + pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; ioapic_is_disabled = true; @@ -2635,9 +2610,8 @@ fake_ioapic_page: ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), - ioapic_phys); + apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); idx++; ioapic_res->start = ioapic_phys; @@ -2648,13 +2622,12 @@ fake_ioapic_page: void __init ioapic_insert_resources(void) { - int i; struct resource *r = ioapic_resources; + int i; if (!r) { if (nr_ioapics > 0) - printk(KERN_ERR - "IO APIC resources couldn't be allocated.\n"); + pr_err("IO APIC resources couldn't be allocated.\n"); return; } @@ -2674,11 +2647,12 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. */ for_each_ioapic(i) { struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) return i; } - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); return -1; } @@ -2745,12 +2719,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); return -EINVAL; } - for_each_ioapic(ioapic) + + for_each_ioapic(ioapic) { if (ioapics[ioapic].mp_config.apicaddr == address) { - pr_warn("address 0x%x conflicts with IOAPIC%d\n", - address, ioapic); + pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic); return -EEXIST; } + } idx = find_free_ioapic_entry(); if (idx >= MAX_IO_APICS) { @@ -2785,8 +2760,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, (gsi_end >= gsi_cfg->gsi_base && gsi_end <= gsi_cfg->gsi_end)) { pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", - gsi_base, gsi_end, - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end); clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENOSPC; } @@ -2820,8 +2794,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].nr_registers = entries; pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", - idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), gsi_cfg->gsi_base, gsi_cfg->gsi_end); return 0; @@ -2850,8 +2823,7 @@ int mp_unregister_ioapic(u32 gsi_base) if (irq >= 0) { data = irq_get_chip_data(irq); if (data && data->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); + pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic); return -EBUSY; } } @@ -2968,10 +2940,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, legacy_pic->mask(virq); local_irq_restore(flags); - apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", - ioapic, mpc_ioapic_id(ioapic), pin, virq, - data->is_level, data->active_low); + apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low); return 0; free_irqs: From 54cd3795b471057a7e82acaade2b6a22beee1242 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:43 +0200 Subject: [PATCH 042/126] x86/ioapic: Cleanup guarded debug printk()s Cleanup the APIC printk()s which are inside of a apic verbosity guarded region by using apic_dbg() for the KERN_DEBUG level prints. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.714763708@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 67 +++++++++++++++------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b978326baa86..3669b5d27518 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1186,26 +1186,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) char buf[256]; int i; - printk(KERN_DEBUG "IOAPIC %d:\n", apic); + apic_dbg("IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { entry = ioapic_read_entry(apic, i); - snprintf(buf, sizeof(buf), - " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, - entry.masked ? "disabled" : "enabled ", + snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.masked ? "disabled" : "enabled ", entry.is_level ? "level" : "edge ", entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (entry.ir_format) { - printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, - (entry.ir_index_15 << 15) | entry.ir_index_0_14, - entry.ir_zero); + apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf, + (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero); } else { - printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, - entry.dest_mode_logical ? "logical " : "physical", - entry.virt_destid_8_14, entry.destid_0_7, - entry.delivery_mode); + apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physic al", + entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode); } } } @@ -1226,19 +1221,15 @@ static void __init print_IO_APIC(int ioapic_idx) reg_03.raw = io_apic_read(ioapic_idx, 3); } - printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %02X\n", - reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %02X\n", - reg_01.bits.version); + apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + apic_dbg(".... register #00: %08X\n", reg_00.raw); + apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID); + apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS); + apic_dbg(".... register #01: %08X\n", *(int *)®_01); + apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries); + apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1246,8 +1237,8 @@ static void __init print_IO_APIC(int ioapic_idx) * value, so ignore it if reg_02 == reg_01. */ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + apic_dbg(".... register #02: %08X\n", reg_02.raw); + apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration); } /* @@ -1257,11 +1248,11 @@ static void __init print_IO_APIC(int ioapic_idx) */ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + apic_dbg(".... register #03: %08X\n", reg_03.raw); + apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT); } - printk(KERN_DEBUG ".... IRQ redirection table:\n"); + apic_dbg(".... IRQ redirection table:\n"); io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } @@ -1270,11 +1261,11 @@ void __init print_IO_APICs(void) int ioapic_idx; unsigned int irq; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for_each_ioapic(ioapic_idx) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mpc_ioapic_id(ioapic_idx), - ioapics[ioapic_idx].nr_registers); + apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries); + for_each_ioapic(ioapic_idx) { + apic_dbg("number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers); + } /* * We are a bit conservative about what we expect. We have to @@ -1285,7 +1276,7 @@ void __init print_IO_APICs(void) for_each_ioapic(ioapic_idx) print_IO_APIC(ioapic_idx); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); + apic_dbg("IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; struct irq_chip *chip; @@ -1300,7 +1291,7 @@ void __init print_IO_APICs(void) if (list_empty(&data->irq_2_pin)) continue; - printk(KERN_DEBUG "IRQ%d ", irq); + apic_dbg("IRQ%d ", irq); for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); From 1ee0aa8285c11921ad22336d07b67fc81a0d36cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:44 +0200 Subject: [PATCH 043/126] x86/mpparse: Cleanup apic_printk()s Use the new apic_pr_verbose() helper. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.779537108@linutronix.de --- arch/x86/kernel/mpparse.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e89171b0347a..4a1b1b28abf9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -68,7 +68,7 @@ static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + apic_pr_verbose("Bus #%d is %s\n", m->busid, str); } static void __init MP_bus_info(struct mpc_bus *m) @@ -417,7 +417,7 @@ static unsigned long __init get_mpc_size(unsigned long physptr) mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; early_memunmap(mpc, PAGE_SIZE); - apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size); return size; } @@ -560,8 +560,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; int ret = 0; - apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", - base, base + length - 1); + apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -683,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) { int i; - apic_printk(APIC_VERBOSE, "OLD "); + apic_pr_verbose("OLD "); print_mp_irq_info(m); i = get_MP_intsrc_index(m); if (i > 0) { memcpy(m, &mp_irqs[i], sizeof(*m)); - apic_printk(APIC_VERBOSE, "NEW "); + apic_pr_verbose("NEW "); print_mp_irq_info(&mp_irqs[i]); return; } @@ -772,7 +771,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, continue; if (nr_m_spare > 0) { - apic_printk(APIC_VERBOSE, "*NEW* found\n"); + apic_pr_verbose("*NEW* found\n"); nr_m_spare--; memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); m_spare[nr_m_spare] = NULL; From 48855a2c92203389cb215c86ee3d2f2df5aa4024 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:45 +0200 Subject: [PATCH 044/126] iommu/vt-d: Cleanup apic_printk() Use the new apic_pr_verbose() helper. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.843266805@linutronix.de --- drivers/iommu/intel/irq_remapping.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index e090ca07364b..7a6d188e3bea 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1352,12 +1352,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ set_ioapic_sid(irte, info->devid); - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->devid, irte->present, irte->fpd, - irte->dst_mode, irte->redir_hint, - irte->trigger_mode, irte->dlvry_mode, - irte->avail, irte->vector, irte->dest_id, - irte->sid, irte->sq, irte->svt); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); sub_handle = info->ioapic.pin; break; case X86_IRQ_ALLOC_TYPE_HPET: From ee64510fb959991c3afd94e05e468a5b27e77f68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:47 +0200 Subject: [PATCH 045/126] x86/ioapic: Move replace_pin_at_irq_node() to the call site It's only used by check_timer(). Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.906636514@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 40 +++++++++++++++------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3669b5d27518..51ecfb5582dc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -370,28 +370,6 @@ static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) } } -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, data->irq_2_pin) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - /* every one is different, right? */ - return; - } - } - - /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(data, node, newapic, newpin); -} - static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { @@ -2067,6 +2045,24 @@ static int __init mp_alloc_timer_irq(int ioapic, int pin) return irq; } +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + return; + } + } + + /* Old apic/pin didn't exist, so just add a new one */ + add_pin_to_irq_node(data, node, newapic, newpin); +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ From 75d449402b125648bf048309c9d22b39262d6fba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:48 +0200 Subject: [PATCH 046/126] x86/ioapic: Cleanup comments Use proper comment styles and shrink comments to their scope where applicable. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155440.969619978@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 86 +++++++++++++++------------------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 51ecfb5582dc..25d1d1af3830 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -384,12 +384,12 @@ static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, } } +/* + * Synchronize the IO-APIC and the CPU by doing a dummy read from the + * IO-APIC + */ static void io_apic_sync(struct irq_pin_list *entry) { - /* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ struct io_apic __iomem *io_apic; io_apic = io_apic_base(entry->apic); @@ -442,17 +442,13 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) entry = entry1 = __ioapic_read_entry(apic, pin); - /* - * Mask the entry and change the trigger mode to edge. - */ + /* Mask the entry and change the trigger mode to edge. */ entry1.masked = true; entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); - /* - * Restore the previous level triggered entry. - */ + /* Restore the previous level triggered entry. */ __ioapic_write_entry(apic, pin, entry); } } @@ -1012,16 +1008,12 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) { u32 gsi = mp_pin_to_gsi(ioapic, pin); - /* - * Debugging check, we are in big trouble if this message pops up! - */ + /* Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); #ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ + /* PCI IRQ command line redirection. Yes, limits are hardcoded. */ if ((pin >= 16) && (pin <= 23)) { if (pirq_entries[pin - 16] != -1) { if (!pirq_entries[pin - 16]) { @@ -1296,8 +1288,9 @@ void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. + /* + * If the interrupt line is enabled and in ExtInt mode I + * have found the pin where the i8259 is connected. */ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { @@ -1307,8 +1300,11 @@ void __init enable_IO_APIC(void) } } found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic + + /* + * Look to see what if the MP table has reported the ExtINT + * + * If we could not find the appropriate pin by looking at the ioapic * the i8259 probably is not connected the ioapic but give the * mptable a chance anyway. */ @@ -1348,9 +1344,7 @@ void native_restore_boot_irq_mode(void) entry.destid_0_7 = apic_id & 0xFF; entry.virt_destid_8_14 = apic_id >> 8; - /* - * Add it to the IO-APIC irq-routing table: - */ + /* Add it to the IO-APIC irq-routing table */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } @@ -1427,8 +1421,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) } /* - * We need to adjust the IRQ routing table - * if the ID changed. + * We need to adjust the IRQ routing table if the ID + * changed. */ if (old_id != mpc_ioapic_id(ioapic_idx)) for (i = 0; i < mp_irq_entries; i++) @@ -1437,8 +1431,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) = mpc_ioapic_id(ioapic_idx); /* - * Update the ID register according to the right value - * from the MPC table if they are different. + * Update the ID register according to the right value from + * the MPC table if they are different. */ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) continue; @@ -1562,21 +1556,17 @@ static int __init timer_irq_works(void) * so we 'resend' these IRQs via IPIs, to the same CPU. It's much * better to do it this way as thus we do not have to be aware of * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... + * + * Edge triggered needs to resend any interrupt that was delayed but this + * is now handled in the device independent code. + * + * Starting up a edge-triggered IO-APIC interrupt is nasty - we need to + * make sure that we get the edge. If it is already asserted for some + * reason, we need return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake an edge even if it + * isn't on the 8259A... */ static unsigned int startup_ioapic_irq(struct irq_data *data) { @@ -1627,7 +1617,8 @@ static inline bool ioapic_prepare_move(struct irq_data *data) static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { if (unlikely(moveit)) { - /* Only migrate the irq if the ack has been received. + /* + * Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets * delayed going to ioapics, and if we reprogram the @@ -1904,14 +1895,13 @@ static inline void init_IO_APIC_traps(void) cfg = irq_cfg(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. + * Hmm.. We don't have an entry for this, so + * default to an old-fashioned 8259 interrupt if we + * can. Otherwise set the dummy interrupt chip. */ if (irq < nr_legacy_irqs()) legacy_pic->make_irq(irq); else - /* Strange. Oh, well.. */ irq_set_chip(irq, &no_irq_chip); } } @@ -2307,9 +2297,7 @@ void __init setup_IO_APIC(void) for_each_ioapic(ioapic) BUG_ON(mp_irqdomain_create(ioapic)); - /* - * Set up IO-APIC IRQ routing. - */ + /* Set up IO-APIC IRQ routing. */ x86_init.mpparse.setup_ioapic_ids(); sync_Arb_IDs(); From 4bcfdf76d7d1976b035ca75776bc8d266a09d6c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:49 +0200 Subject: [PATCH 047/126] x86/ioapic: Cleanup bracket usage Add brackets around if/for constructs as required by coding style or remove pointless line breaks to make it true single line statements which do not require brackets. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155441.032045616@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 25d1d1af3830..8d81c471edf0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -362,12 +362,13 @@ static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) { if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); return; } + } } static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, @@ -562,8 +563,7 @@ int save_ioapic_entries(void) } for_each_pin(apic, pin) - ioapics[apic].saved_registers[pin] = - ioapic_read_entry(apic, pin); + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } return err; @@ -604,8 +604,7 @@ int restore_ioapic_entries(void) continue; for_each_pin(apic, pin) - ioapic_write_entry(apic, pin, - ioapics[apic].saved_registers[pin]); + ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]); } return 0; } @@ -617,12 +616,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type) { int i; - for (i = 0; i < mp_irq_entries; i++) + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].irqtype == type && (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; + } return -1; } @@ -662,9 +662,10 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int ioapic_idx; - for_each_ioapic(ioapic_idx) + for_each_ioapic(ioapic_idx) { if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) return ioapic_idx; + } } return -1; @@ -1424,11 +1425,12 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) * We need to adjust the IRQ routing table if the ID * changed. */ - if (old_id != mpc_ioapic_id(ioapic_idx)) - for (i = 0; i < mp_irq_entries; i++) + if (old_id != mpc_ioapic_id(ioapic_idx)) { + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].dstapic == old_id) - mp_irqs[i].dstapic - = mpc_ioapic_id(ioapic_idx); + mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx); + } + } /* * Update the ID register according to the right value from @@ -2666,12 +2668,10 @@ static int bad_ioapic_register(int idx) static int find_free_ioapic_entry(void) { - int idx; - - for (idx = 0; idx < MAX_IO_APICS; idx++) + for (int idx = 0; idx < MAX_IO_APICS; idx++) { if (ioapics[idx].nr_registers == 0) return idx; - + } return MAX_IO_APICS; } @@ -2780,11 +2780,13 @@ int mp_unregister_ioapic(u32 gsi_base) int ioapic, pin; int found = 0; - for_each_ioapic(ioapic) + for_each_ioapic(ioapic) { if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { found = 1; break; } + } + if (!found) { pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); return -ENODEV; From 966e09b1862555267effa1db9032dc3dbc0cc588 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:50 +0200 Subject: [PATCH 048/126] x86/ioapic: Cleanup line breaks 80 character limit is history. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155441.095653193@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 55 +++++++++++----------------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8d81c471edf0..051779faa43d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -637,10 +637,8 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) - return mp_irqs[i].dstirq; } return -1; @@ -653,8 +651,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) break; } @@ -907,8 +904,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return -1; } - return __irq_domain_alloc_irqs(domain, irq, 1, - ioapic_alloc_attr_node(info), + return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), info, legacy, NULL); } @@ -922,13 +918,12 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be * multiple pins sharing the same legacy IRQ number when ACPI is disabled. */ -static int alloc_isa_irq_from_domain(struct irq_domain *domain, - int irq, int ioapic, int pin, +static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin, struct irq_alloc_info *info) { - struct mp_chip_data *data; struct irq_data *irq_data = irq_get_irq_data(irq); int node = ioapic_alloc_attr_node(info); + struct mp_chip_data *data; /* * Legacy ISA IRQ has already been allocated, just add pin to @@ -942,8 +937,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, - NULL); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -1121,8 +1115,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; out: - return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, - IOAPIC_MAP_ALLOC); + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC); } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); @@ -1293,14 +1286,12 @@ void __init enable_IO_APIC(void) * If the interrupt line is enabled and in ExtInt mode I * have found the pin where the i8259 is connected. */ - if (!entry.masked && - entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { + if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; - goto found_i8259; + break; } } - found_i8259: /* * Look to see what if the MP table has reported the ExtINT @@ -1496,8 +1487,7 @@ static void __init delay_with_tsc(void) do { rep_nop(); now = rdtsc(); - } while ((now - start) < 40000000000ULL / HZ && - time_before_eq(jiffies, end)); + } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } static void __init delay_without_tsc(void) @@ -1912,20 +1902,17 @@ static inline void init_IO_APIC_traps(void) /* * The local APIC irq-chip implementation: */ - static void mask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } @@ -1944,8 +1931,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } /* @@ -2265,10 +2251,8 @@ static int mp_irqdomain_create(int ioapic) return -ENOMEM; } - if (cfg->type == IOAPIC_DOMAIN_LEGACY || - cfg->type == IOAPIC_DOMAIN_STRICT) - ioapic_dynirq_base = max(ioapic_dynirq_base, - gsi_cfg->gsi_end + 1); + if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); return 0; } @@ -2682,8 +2666,7 @@ static int find_free_ioapic_entry(void) * @gsi_base: base of GSI associated with the IOAPIC * @cfg: configuration information for the IOAPIC */ -int mp_register_ioapic(int id, u32 address, u32 gsi_base, - struct ioapic_domain_cfg *cfg) +int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg) { bool hotplug = !!ioapic_initialized; struct mp_ioapic_gsi *gsi_cfg; @@ -2835,8 +2818,7 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, if (info && info->ioapic.valid) { data->is_level = info->ioapic.is_level; data->active_low = info->ioapic.active_low; - } else if (__acpi_get_override_irq(gsi, &data->is_level, - &data->active_low) < 0) { + } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ data->is_level = true; data->active_low = true; @@ -2956,8 +2938,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data) { /* It won't be called for IRQ with multiple IOAPIC pins associated */ - ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); } int mp_irqdomain_ioapic_idx(struct irq_domain *domain) From 62e303e346d7f829ff4f52c73176451c7f85f4bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 2 Aug 2024 18:15:52 +0200 Subject: [PATCH 049/126] x86/ioapic: Cleanup remaining coding style issues Add missing new lines and reorder variable definitions. Signed-off-by: Thomas Gleixner Tested-by: Qiuxu Zhuo Tested-by: Breno Leitao Link: https://lore.kernel.org/all/20240802155441.158662179@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 051779faa43d..1029ea4ac8ba 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -264,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); } unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); return readl(&io_apic->data); } @@ -880,9 +882,9 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + int type = ioapics[ioapic].irqdomain_cfg.type; bool legacy = false; int irq = -1; - int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: @@ -951,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioa static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { - int irq; - bool legacy = false; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); struct irq_alloc_info tmp; struct mp_chip_data *data; - struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + bool legacy = false; + int irq; if (!domain) return -ENOSYS; @@ -1269,8 +1271,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { - int i8259_apic, i8259_pin; - int apic, pin; + int i8259_apic, i8259_pin, apic, pin; if (ioapic_is_disabled) nr_ioapics = 0; @@ -1943,9 +1944,9 @@ static void lapic_register_intr(int irq) */ static inline void __init unlock_ExtINT_logic(void) { - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + struct IO_APIC_route_entry entry0, entry1; + int apic, pin, i; u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); @@ -2211,11 +2212,11 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_domain *parent; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; - struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct irq_domain *parent; struct fwnode_handle *fn; struct irq_fwspec fwspec; @@ -2491,8 +2492,8 @@ static struct resource *ioapic_resources; static struct resource * __init ioapic_setup_resources(void) { - unsigned long n; struct resource *res; + unsigned long n; char *mem; int i; From 00e5bd44389145cd2f42be8e98cadb210731e72a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 3 Aug 2024 19:37:04 +0800 Subject: [PATCH 050/126] x86/apic: Remove unused inline function apic_set_eoi_cb() Commit 2744a7ce34a7 ("x86/apic: Replace acpi_wake_cpu_handler_update() and apic_set_eoi_cb()") left it unused. Signed-off-by: Yue Haibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240803113704.246752-1-yuehaibing@huawei.com --- arch/x86/include/asm/apic.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 34eaebe2f61d..9dd22ee27117 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -489,7 +489,6 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } -static inline void apic_set_eoi_cb(void (*eoi)(void)) {} static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } static inline void apic_setup_apic_calls(void) { } From 195a56986c50599e5d10a558c12f74266fd51bab Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 26 Jul 2024 05:09:30 -0700 Subject: [PATCH 051/126] docs: fault-injection: document cache-filter feature for failslab The failslab fault injection mechanism has an undocumented capability that provides significant utility in testing and debugging. This feature, introduced in commit 4c13dd3b48fcb ("failslab: add ability to filter slab caches"), allows for targeted error injection into specific slab caches. However, it was inadvertently left undocumented at the time of its implementation. Add documentation for the cache-filter feature in the failslab mode description. Also, providing a practical example demonstrating how to use cache-filter to inject failures specifically when allocating socket buffers (skbs). Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240726120930.3231333-1-leitao@debian.org --- .../fault-injection/fault-injection.rst | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 70380a2a01b4..07c24710bd21 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -141,6 +141,14 @@ configuration of fault-injection capabilities. default is 'Y', setting it to 'N' will also inject failures into highmem/user allocations (__GFP_HIGHMEM allocations). +- /sys/kernel/debug/failslab/cache-filter + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will only inject failures when + objects are requests from certain caches. + + Select the cache by writing '1' to /sys/kernel/slab//failslab: + - /sys/kernel/debug/failslab/ignore-gfp-wait: - /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait: @@ -459,6 +467,18 @@ Application Examples losetup -d $DEVICE rm testfile.img +------------------------------------------------------------------------------ + +- Inject only skbuff allocation failures :: + + # mark skbuff_head_cache as faulty + echo 1 > /sys/kernel/slab/skbuff_head_cache/failslab + # Turn on cache filter (off by default) + echo 1 > /sys/kernel/debug/failslab/cache-filter + # Turn on fault injection + echo 1 > /sys/kernel/debug/failslab/times + echo 1 > /sys/kernel/debug/failslab/probability + Tool to run command with failslab or fail_page_alloc ---------------------------------------------------- From 91031ca349ee607b3e1adc34578a33af5708a96c Mon Sep 17 00:00:00 2001 From: Jiamu Sun Date: Sun, 4 Aug 2024 02:19:09 +0900 Subject: [PATCH 052/126] docs: improve comment consistency in .muttrc example configuration Added a space to align comment formatting; this helps improve consistency and visual uniformity. Signed-off-by: Jiamu Sun Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/SY0P300MB0801D1A4B278157CA7C92DE2CEBC2@SY0P300MB0801.AUSP300.PROD.OUTLOOK.COM --- Documentation/process/email-clients.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/email-clients.rst b/Documentation/process/email-clients.rst index dd22c46d1d02..e6b9173a1845 100644 --- a/Documentation/process/email-clients.rst +++ b/Documentation/process/email-clients.rst @@ -216,7 +216,7 @@ Mutt is highly customizable. Here is a minimum configuration to start using Mutt to send patches through Gmail:: # .muttrc - # ================ IMAP ==================== + # ================ IMAP ==================== set imap_user = 'yourusername@gmail.com' set imap_pass = 'yourpassword' set spoolfile = imaps://imap.gmail.com/INBOX From e9c7acd723127f0b9bf78202a02f2e5e7d0b6a4f Mon Sep 17 00:00:00 2001 From: Shibu Kumar Date: Sun, 4 Aug 2024 00:03:03 +0530 Subject: [PATCH 053/126] docs: dm-crypt: Removal of unexpected indentation error Add the required indentation to fix this docs build error: Documentation/admin-guide/device-mapper/dm-crypt.rst:167: ERROR: Unexpected indentation. Also split the documentation for read and write into separate blocks. Signed-off-by: Shibu kumar shibukumar.bit@gmail.com [jc: rewrote changelog] Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240803183306.32425-1-shibukumar.bit@gmail.com --- .../admin-guide/device-mapper/dm-crypt.rst | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index e625830d335e..48a48bd09372 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -162,13 +162,19 @@ iv_large_sectors Module parameters:: -max_read_size -max_write_size - Maximum size of read or write requests. When a request larger than this size - is received, dm-crypt will split the request. The splitting improves - concurrency (the split requests could be encrypted in parallel by multiple - cores), but it also causes overhead. The user should tune these parameters to - fit the actual workload. + max_read_size + Maximum size of read requests. When a request larger than this size + is received, dm-crypt will split the request. The splitting improves + concurrency (the split requests could be encrypted in parallel by multiple + cores), but it also causes overhead. The user should tune this parameters to + fit the actual workload. + + max_write_size + Maximum size of write requests. When a request larger than this size + is received, dm-crypt will split the request. The splitting improves + concurrency (the split requests could be encrypted in parallel by multiple + cores), but it also causes overhead. The user should tune this parameters to + fit the actual workload. Example scripts From 602bce7e5edeb0d701ee2074fe512c4d793dac0d Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Fri, 19 Jul 2024 19:22:06 -0500 Subject: [PATCH 054/126] docs: scheduler: Start documenting the EEVDF scheduler Add some documentation regarding the newly introduced scheduler EEVDF. Signed-off-by: Carlos Bilbao Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240720002207.444286-2-carlos.bilbao.osdev@gmail.com --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/sched-design-CFS.rst | 10 +++-- Documentation/scheduler/sched-eevdf.rst | 43 ++++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 Documentation/scheduler/sched-eevdf.rst diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 43bd8a145b7a..1f2942c4d14b 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -12,6 +12,7 @@ Scheduler sched-bwc sched-deadline sched-design-CFS + sched-eevdf sched-domains sched-capacity sched-energy diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index bc1e507269c6..8786f219fc73 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -8,10 +8,12 @@ CFS Scheduler 1. OVERVIEW ============ -CFS stands for "Completely Fair Scheduler," and is the new "desktop" process -scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. It is the -replacement for the previous vanilla scheduler's SCHED_OTHER interactivity -code. +CFS stands for "Completely Fair Scheduler," and is the "desktop" process +scheduler implemented by Ingo Molnar and merged in Linux 2.6.23. When +originally merged, it was the replacement for the previous vanilla +scheduler's SCHED_OTHER interactivity code. Nowadays, CFS is making room +for EEVDF, for which documentation can be found in +Documentation/scheduler/sched-eevdf.rst. 80% of CFS's design can be summed up in a single sentence: CFS basically models an "ideal, precise multi-tasking CPU" on real hardware. diff --git a/Documentation/scheduler/sched-eevdf.rst b/Documentation/scheduler/sched-eevdf.rst new file mode 100644 index 000000000000..83efe7c0a30d --- /dev/null +++ b/Documentation/scheduler/sched-eevdf.rst @@ -0,0 +1,43 @@ +=============== +EEVDF Scheduler +=============== + +The "Earliest Eligible Virtual Deadline First" (EEVDF) was first introduced +in a scientific publication in 1995 [1]. The Linux kernel began +transitioning to EEVDF in version 6.6 (as a new option in 2024), moving +away from the earlier Completely Fair Scheduler (CFS) in favor of a version +of EEVDF proposed by Peter Zijlstra in 2023 [2-4]. More information +regarding CFS can be found in +Documentation/scheduler/sched-design-CFS.rst. + +Similarly to CFS, EEVDF aims to distribute CPU time equally among all +runnable tasks with the same priority. To do so, it assigns a virtual run +time to each task, creating a "lag" value that can be used to determine +whether a task has received its fair share of CPU time. In this way, a task +with a positive lag is owed CPU time, while a negative lag means the task +has exceeded its portion. EEVDF picks tasks with lag greater or equal to +zero and calculates a virtual deadline (VD) for each, selecting the task +with the earliest VD to execute next. It's important to note that this +allows latency-sensitive tasks with shorter time slices to be prioritized, +which helps with their responsiveness. + +There are ongoing discussions on how to manage lag, especially for sleeping +tasks; but at the time of writing EEVDF uses a "decaying" mechanism based +on virtual run time (VRT). This prevents tasks from exploiting the system +by sleeping briefly to reset their negative lag: when a task sleeps, it +remains on the run queue but marked for "deferred dequeue," allowing its +lag to decay over VRT. Hence, long-sleeping tasks eventually have their lag +reset. Finally, tasks can preempt others if their VD is earlier, and tasks +can request specific time slices using the new sched_setattr() system call, +which further facilitates the job of latency-sensitive applications. + +REFERENCES +========== + +[1] https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=805acf7726282721504c8f00575d91ebfd750564 + +[2] https://lore.kernel.org/lkml/a79014e6-ea83-b316-1e12-2ae056bda6fa@linux.vnet.ibm.com/ + +[3] https://lwn.net/Articles/969062/ + +[4] https://lwn.net/Articles/925371/ From 1f5e3920b5e45cef81a31a1dcf8e1d0f615840fa Mon Sep 17 00:00:00 2001 From: ganjie Date: Thu, 8 Aug 2024 16:57:07 +0800 Subject: [PATCH 055/126] Documentation: dontdiff: remove 'utf8data.h' Commit 2b3d04787012 ("unicode: Add utf8-data module") changed the database file from 'utf8data.h' to 'utf8data.c' to build separate module, but it seems forgot to update Documentation/dontdiff. Remove 'utf8data.h' and add 'utf8data.c'. Signed-off-by: ganjie Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240808085707.3235019-1-guoxuenan@huawei.com --- Documentation/dontdiff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 3c399f132e2d..94b3492dc301 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -262,7 +262,7 @@ vsyscall_32.lds wanxlfw.inc uImage unifdef -utf8data.h +utf8data.c wakeup.bin wakeup.elf wakeup.lds From 830a0d12943f53077b235f2a3caa8ab2b36475a3 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 6 Aug 2024 20:08:23 +0800 Subject: [PATCH 056/126] x86/mm: Don't print out SRAT table information This per CPU log is becoming longer with more and more CPUs in system, which slows down the boot process due to the serializing nature of printk(). The value of this information is dubious and it can be retrieved by lscpu from user space if required.. Downgrade the printk() to pr_debug() so it is still accessible for debug purposes. [ tglx: Massaged changelog ] Signed-off-by: Li RongQing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240806120823.17111-1-lirongqing@baidu.com --- arch/x86/mm/srat.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9c52a95937ad..6f8e0f21c710 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,8 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ @@ -98,8 +97,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); + pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } int __init x86_acpi_numa_init(void) From 477d81a1c47a1b79b9c08fc92b5dea3c5143800b Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:30 +0200 Subject: [PATCH 057/126] x86/entry: Remove unwanted instrumentation in common_interrupt() common_interrupt() and related variants call kvm_set_cpu_l1tf_flush_l1d(), which is neither marked noinstr nor __always_inline. So compiler puts it out of line and adds instrumentation to it. Since the call is inside of instrumentation_begin/end(), objtool does not warn about it. The manifestation is that KCOV produces spurious coverage in kvm_set_cpu_l1tf_flush_l1d() in random places because the call happens when preempt count is not yet updated to say that the kernel is in an interrupt. Mark kvm_set_cpu_l1tf_flush_l1d() as __always_inline and move it out of the instrumentation_begin/end() section. It only calls __this_cpu_write() which is already safe to call in noinstr contexts. Fixes: 6368558c3710 ("x86/entry: Provide IDTENTRY_SYSVEC") Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/3f9a1de9e415fcb53d07dc9e19fa8481bb021b1b.1718092070.git.dvyukov@google.com --- arch/x86/include/asm/hardirq.h | 8 ++++++-- arch/x86/include/asm/idtentry.h | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index c67fa6ad098a..6ffa8b75f4cd 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -69,7 +69,11 @@ extern u64 arch_irq_stat(void); #define local_softirq_pending_ref pcpu_hot.softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) -static inline void kvm_set_cpu_l1tf_flush_l1d(void) +/* + * This function is called from noinstr interrupt contexts + * and must be inlined to not get instrumentation. + */ +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } @@ -84,7 +88,7 @@ static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ -static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index d4f24499b256..ad5c68f0509d 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -212,8 +212,8 @@ __visible noinstr void func(struct pt_regs *regs, \ irqentry_state_t state = irqentry_enter(regs); \ u32 vector = (u32)(u8)error_code; \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_irq_on_irqstack_cond(__##func, regs, vector); \ instrumentation_end(); \ irqentry_exit(regs, state); \ @@ -250,7 +250,6 @@ static void __##func(struct pt_regs *regs); \ \ static __always_inline void instr_##func(struct pt_regs *regs) \ { \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_sysvec_on_irqstack_cond(__##func, regs); \ } \ \ @@ -258,6 +257,7 @@ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ instr_##func (regs); \ instrumentation_end(); \ @@ -288,7 +288,6 @@ static __always_inline void __##func(struct pt_regs *regs); \ static __always_inline void instr_##func(struct pt_regs *regs) \ { \ __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs); \ __irq_exit_raw(); \ } \ @@ -297,6 +296,7 @@ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ instr_##func (regs); \ instrumentation_end(); \ From 6cd0dd934b03d4ee4094ac474108723e2f2ed7d6 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:31 +0200 Subject: [PATCH 058/126] kcov: Add interrupt handling self test Add a boot self test that can catch sprious coverage from interrupts. The coverage callback filters out interrupt code, but only after the handler updates preempt count. Some code periodically leaks out of that section and leads to spurious coverage. Add a best-effort (but simple) test that is likely to catch such bugs. If the test is enabled on CI systems that use KCOV, they should catch any issues fast. Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/all/7662127c97e29da1a748ad1c1539dd7b65b737b2.1718092070.git.dvyukov@google.com --- kernel/kcov.c | 31 +++++++++++++++++++++++++++++++ lib/Kconfig.debug | 8 ++++++++ 2 files changed, 39 insertions(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..d9d4a0c04185 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1058,6 +1059,32 @@ u64 kcov_common_handle(void) } EXPORT_SYMBOL(kcov_common_handle); +#ifdef CONFIG_KCOV_SELFTEST +static void __init selftest(void) +{ + unsigned long start; + + pr_err("running self test\n"); + /* + * Test that interrupts don't produce spurious coverage. + * The coverage callback filters out interrupt code, but only + * after the handler updates preempt count. Some code periodically + * leaks out of that section and leads to spurious coverage. + * It's hard to call the actual interrupt handler directly, + * so we just loop here for a bit waiting for a timer interrupt. + * We set kcov_mode to enable tracing, but don't setup the area, + * so any attempt to trace will crash. Note: we must not call any + * potentially traced functions in this region. + */ + start = jiffies; + current->kcov_mode = KCOV_MODE_TRACE_PC; + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300) + ; + current->kcov_mode = 0; + pr_err("done running self test\n"); +} +#endif + static int __init kcov_init(void) { int cpu; @@ -1077,6 +1104,10 @@ static int __init kcov_init(void) */ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); +#ifdef CONFIG_KCOV_SELFTEST + selftest(); +#endif + return 0; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..270e367b3e6f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2173,6 +2173,14 @@ config KCOV_IRQ_AREA_SIZE soft interrupts. This specifies the size of those areas in the number of unsigned long words. +config KCOV_SELFTEST + bool "Perform short selftests on boot" + depends on KCOV + help + Run short KCOV coverage collection selftests on boot. + On test failure, causes the kernel to panic. Recommended to be + enabled, ensuring critical functionality works as intended. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" default y From f34d086fb7102fec895fd58b9e816b981b284c17 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:32 +0200 Subject: [PATCH 059/126] module: Fix KCOV-ignored file name module.c was renamed to main.c, but the Makefile directive was copy-pasted verbatim with the old file name. Fix up the file name. Fixes: cfc1d277891e ("module: Move all into module/") Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/bc0cf790b4839c5e38e2fafc64271f620568a39e.1718092070.git.dvyukov@google.com --- kernel/module/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/module/Makefile b/kernel/module/Makefile index a10b2b9a6fdf..50ffcc413b54 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -5,7 +5,7 @@ # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_main.o := n obj-y += main.o obj-y += strict_rwx.o From ae94b263f5f69c180347e795fbefa051b65aacc3 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:33 +0200 Subject: [PATCH 060/126] x86: Ignore stack unwinding in KCOV Stack unwinding produces large amounts of uninteresting coverage. It's called from KASAN kmalloc/kfree hooks, fault injection, etc. It's not particularly useful and is not a function of system call args. Ignore that code. Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/eaf54b8634970b73552dcd38bf9be6ef55238c10.1718092070.git.dvyukov@google.com --- arch/x86/kernel/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a847180836e4..f7918980667a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,6 +35,14 @@ KMSAN_SANITIZE_nmi.o := n # If instrumentation of the following files is enabled, boot hangs during # first second. KCOV_INSTRUMENT_head$(BITS).o := n +# These are called from save_stack_trace() on debug paths, +# and produce large amounts of uninteresting coverage. +KCOV_INSTRUMENT_stacktrace.o := n +KCOV_INSTRUMENT_dumpstack.o := n +KCOV_INSTRUMENT_dumpstack_$(BITS).o := n +KCOV_INSTRUMENT_unwind_orc.o := n +KCOV_INSTRUMENT_unwind_frame.o := n +KCOV_INSTRUMENT_unwind_guess.o := n CFLAGS_irq.o := -I $(src)/../include/asm/trace From 838ba7733e4e3a94a928e8d0a058de1811a58621 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 28 Jul 2024 13:06:10 +0200 Subject: [PATCH 061/126] x86/apic: Remove logical destination mode for 64-bit Logical destination mode of the local APIC is used for systems with up to 8 CPUs. It has an advantage over physical destination mode as it allows to target multiple CPUs at once with IPIs. That advantage was definitely worth it when systems with up to 8 CPUs were state of the art for servers and workstations, but that's history. Aside of that there are systems which fail to work with logical destination mode as the ACPI/DMI quirks show and there are AMD Zen1 systems out there which fail when interrupt remapping is enabled as reported by Rob and Christian. The latter problem can be cured by firmware updates, but not all OEMs distribute the required changes. Physical destination mode is guaranteed to work because it is the only way to get a CPU up and running via the INIT/INIT/STARTUP sequence. As the number of CPUs keeps increasing, logical destination mode becomes a less used code path so there is no real good reason to keep it around. Therefore remove logical destination mode support for 64-bit and default to physical destination mode. Reported-by: Rob Newcater Reported-by: Christian Heusel Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov (AMD) Tested-by: Rob Newcater Link: https://lore.kernel.org/all/877cd5u671.ffs@tglx --- arch/x86/include/asm/apic.h | 8 -- arch/x86/kernel/apic/apic_flat_64.c | 119 ++-------------------------- 2 files changed, 7 insertions(+), 120 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 9dd22ee27117..53f08447193d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -350,20 +350,12 @@ extern struct apic *apic; * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. - * - * For the files having two apic drivers, we use apic_drivers() - * to enforce the order with in them. */ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } -#define apic_drivers(sym1, sym2) \ - static struct apic *__apicdrivers_##sym1##sym2[2] __used \ - __aligned(sizeof(struct apic *)) \ - __section(".apicdrivers") = { &sym1, &sym2 } - extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f37ad3392fec..e0308d8c4e6c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -8,129 +8,25 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include #include -#include -#include #include #include "local.h" -static struct apic apic_physflat; -static struct apic apic_flat; - -struct apic *apic __ro_after_init = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 1; -} - -static void _flat_send_IPI_mask(unsigned long mask, int vector) -{ - unsigned long flags; - - local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); - local_irq_restore(flags); -} - -static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - - _flat_send_IPI_mask(mask, vector); -} - -static void -flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - int cpu = smp_processor_id(); - - if (cpu < BITS_PER_LONG) - __clear_bit(cpu, &mask); - - _flat_send_IPI_mask(mask, vector); -} - -static u32 flat_get_apic_id(u32 x) +static u32 physflat_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static int flat_probe(void) +static int physflat_probe(void) { return 1; } -static struct apic apic_flat __ro_after_init = { - .name = "flat", - .probe = flat_probe, - .acpi_madt_oem_check = flat_acpi_madt_oem_check, - - .dest_mode_logical = true, - - .disable_esr = 0, - - .init_apic_ldr = default_init_apic_ldr, - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, - - .calc_dest_apicid = apic_flat_calc_apicid, - - .send_IPI = default_send_IPI_single, - .send_IPI_mask = flat_send_IPI_mask, - .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, - .send_IPI_allbutself = default_send_IPI_allbutself, - .send_IPI_all = default_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - .nmi_to_offline_cpu = true, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -/* - * Physflat mode is used when there are more than 8 CPUs on a system. - * We cannot use logical delivery in this case because the mask - * overflows, so use physical mode. - */ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). - */ - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { - printk(KERN_DEBUG "system APIC only can use physical flat"); - return 1; - } - - if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { - printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); - return 1; - } -#endif - - return 0; -} - -static int physflat_probe(void) -{ - return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); + return 1; } static struct apic apic_physflat __ro_after_init = { @@ -146,7 +42,7 @@ static struct apic apic_physflat __ro_after_init = { .cpu_present_to_apicid = default_cpu_present_to_apicid, .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, + .get_apic_id = physflat_get_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -166,8 +62,7 @@ static struct apic apic_physflat __ro_after_init = { .wait_icr_idle = apic_mem_wait_icr_idle, .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; +apic_driver(apic_physflat); -/* - * We need to check for physflat first, so this order is important. - */ -apic_drivers(apic_physflat, apic_flat); +struct apic *apic __ro_after_init = &apic_physflat; +EXPORT_SYMBOL_GPL(apic); From 4276a0bb62598966716e1ee1ac4a64d382cc9ef7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 25 Apr 2024 21:59:51 +0000 Subject: [PATCH 062/126] x86/mm: Remove unused CR3_HW_ASID_BITS Commit 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") removed the last usage of CR3_HW_ASID_BITS and opted to use X86_CR3_PCID_BITS instead. Remove CR3_HW_ASID_BITS. Signed-off-by: Yosry Ahmed Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240425215951.2310105-1-yosryahmed@google.com --- arch/x86/mm/tlb.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1fe9ba33c580..09950feffd07 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -86,9 +86,6 @@ * */ -/* There are 12 bits of space for ASIDS in CR3 */ -#define CR3_HW_ASID_BITS 12 - /* * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for * user/kernel switches From 2db5f86c0d5da9ef0f3a1a5a9c460bf0f36268bf Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 10 Aug 2024 17:36:10 +0800 Subject: [PATCH 063/126] x86/apic: Remove unused extern declarations The removal of get_physical_broadcast(), generic_bigsmp_probe() and default_apic_id_valid() left the stale declarations around. Remove them. Signed-off-by: Yue Haibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240810093610.2586665-1-yuehaibing@huawei.com --- arch/x86/include/asm/apic.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 53f08447193d..f21ff1932699 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -127,8 +127,6 @@ static inline bool apic_is_x2apic_enabled(void) extern void enable_IR_x2apic(void); -extern int get_physical_broadcast(void); - extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); @@ -508,8 +506,6 @@ static inline bool is_vector_pending(unsigned int vector) #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 -extern void generic_bigsmp_probe(void); - #ifdef CONFIG_X86_LOCAL_APIC #include @@ -532,8 +528,6 @@ static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } static inline void x86_64_probe_apic(void) { } #endif -extern int default_apic_id_valid(u32 apicid); - extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); From a1fab3e69d9d0e9b62aab4450cb97da432ac3ef2 Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Mon, 12 Aug 2024 21:43:18 +0000 Subject: [PATCH 064/126] x86/irq: Fix comment on IRQ vector layout commit f5a3562ec9dd ("x86/irq: Reserve a per CPU IDT vector for posted MSIs") changed the first system vector from LOCAL_TIMER_VECTOR to POSTED_MSI_NOTIFICATION_VECTOR. Reflect this change in the vector layout comment as well. However, instead of pointing to the specific vector, use the FIRST_SYSTEM_VECTOR indirection which essentially refers to the same. This avoids unnecessary modifications to the same comment whenever additional system vectors get added. Signed-off-by: Sohil Mehta Signed-off-by: Thomas Gleixner Reviewed-by: Jacob Pan Link: https://lore.kernel.org/all/20240812214318.2715360-1-sohil.mehta@intel.com --- arch/x86/include/asm/irq_vectors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 13aea8fc3d45..47051871b436 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... LOCAL_TIMER_VECTOR-1 - * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts + * Vectors 129 ... FIRST_SYSTEM_VECTOR-1 : device interrupts + * Vectors FIRST_SYSTEM_VECTOR ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * From 989b5cfaa7b6054f4e1bde914470ee091c23e6a5 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Tue, 9 Jul 2024 08:40:46 -0700 Subject: [PATCH 065/126] x86/fred: Parse cmdline param "fred=" in cpu_parse_early_param() Depending on whether FRED is enabled, sysvec_install() installs a system interrupt handler into either into FRED's system vector dispatch table or into the IDT. However FRED can be disabled later in trap_init(), after sysvec_install() has been invoked already; e.g., the HYPERVISOR_CALLBACK_VECTOR handler is registered with sysvec_install() in kvm_guest_init(), which is called in setup_arch() but way before trap_init(). IOW, there is a gap between FRED is available and available but disabled. As a result, when FRED is available but disabled, early sysvec_install() invocations fail to install the IDT handler resulting in spurious interrupts. Fix it by parsing cmdline param "fred=" in cpu_parse_early_param() to ensure that FRED is disabled before the first sysvec_install() incovations. Fixes: 3810da12710a ("x86/fred: Add a fred= cmdline param") Reported-by: Hou Wenlong Suggested-by: Thomas Gleixner Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240709154048.3543361-2-xin@zytor.com --- arch/x86/kernel/cpu/common.c | 5 +++++ arch/x86/kernel/traps.c | 26 -------------------------- 2 files changed, 5 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d4e539d4e158..10a5402d8297 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,6 +1510,11 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "nousershstk")) setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + /* Minimize the gap between FRED is available and available but disabled. */ + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); + if (arglen != 2 || strncmp(arg, "on", 2)) + setup_clear_cpu_cap(X86_FEATURE_FRED); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..6afb41e6cbbb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1402,34 +1402,8 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif -/* Do not enable FRED by default yet. */ -static bool enable_fred __ro_after_init = false; - -#ifdef CONFIG_X86_FRED -static int __init fred_setup(char *str) -{ - if (!str) - return -EINVAL; - - if (!cpu_feature_enabled(X86_FEATURE_FRED)) - return 0; - - if (!strcmp(str, "on")) - enable_fred = true; - else if (!strcmp(str, "off")) - enable_fred = false; - else - pr_warn("invalid FRED option: 'fred=%s'\n", str); - return 0; -} -early_param("fred", fred_setup); -#endif - void __init trap_init(void) { - if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) - setup_clear_cpu_cap(X86_FEATURE_FRED); - /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); From 73270c1f2369fb37683121ebe097cd37172602b6 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Tue, 9 Jul 2024 08:40:47 -0700 Subject: [PATCH 066/126] x86/fred: Move FRED RSP initialization into a separate function To enable FRED earlier, move the RSP initialization out of cpu_init_fred_exceptions() into cpu_init_fred_rsps(). This is required as the FRED RSP initialization depends on the availability of the CPU entry areas which are set up late in trap_init(), No functional change intended. Marked with Fixes as it's a depedency for the real fix. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240709154048.3543361-3-xin@zytor.com --- arch/x86/include/asm/fred.h | 2 ++ arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kernel/fred.c | 28 +++++++++++++++++++--------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index e86c7ba32435..66d7dbe2d314 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -84,11 +84,13 @@ static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int } void cpu_init_fred_exceptions(void); +void cpu_init_fred_rsps(void); void fred_complete_exception_setup(void); #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } static inline void cpu_init_fred_exceptions(void) { } +static inline void cpu_init_fred_rsps(void) { } static inline void fred_complete_exception_setup(void) { } static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } #endif /* CONFIG_X86_FRED */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 10a5402d8297..6de12b3c1b04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2195,10 +2195,12 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - if (cpu_feature_enabled(X86_FEATURE_FRED)) + if (cpu_feature_enabled(X86_FEATURE_FRED)) { cpu_init_fred_exceptions(); - else + cpu_init_fred_rsps(); + } else { load_current_idt(); + } } /* diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 4bcd8791ad96..99a134fcd5bf 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -32,6 +32,25 @@ void cpu_init_fred_exceptions(void) FRED_CONFIG_INT_STKLVL(0) | FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + wrmsrl(MSR_IA32_FRED_STKLVLS, 0); + wrmsrl(MSR_IA32_FRED_RSP0, 0); + wrmsrl(MSR_IA32_FRED_RSP1, 0); + wrmsrl(MSR_IA32_FRED_RSP2, 0); + wrmsrl(MSR_IA32_FRED_RSP3, 0); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} + +/* Must be called after setup_cpu_entry_areas() */ +void cpu_init_fred_rsps(void) +{ /* * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* * (remember that user space faults are always taken on stack level 0) @@ -47,13 +66,4 @@ void cpu_init_fred_exceptions(void) wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); - - /* Enable FRED */ - cr4_set_bits(X86_CR4_FRED); - /* Any further IDT use is a bug */ - idt_invalidate(); - - /* Use int $0x80 for 32-bit system calls in FRED mode */ - setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); - setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } From a97756cbec448032f84b5bbfe4e101478d1e01e0 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Tue, 9 Jul 2024 08:40:48 -0700 Subject: [PATCH 067/126] x86/fred: Enable FRED right after init_mem_mapping() On 64-bit init_mem_mapping() relies on the minimal page fault handler provided by the early IDT mechanism. The real page fault handler is installed right afterwards into the IDT. This is problematic on CPUs which have X86_FEATURE_FRED set because the real page fault handler retrieves the faulting address from the FRED exception stack frame and not from CR2, but that does obviously not work when FRED is not yet enabled in the CPU. To prevent this enable FRED right after init_mem_mapping() without interrupt stacks. Those are enabled later in trap_init() after the CPU entry area is set up. [ tglx: Encapsulate the FRED details ] Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Reported-by: Hou Wenlong Suggested-by: Thomas Gleixner Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240709154048.3543361-4-xin@zytor.com --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 15 +++++++++++++-- arch/x86/kernel/setup.c | 7 ++++++- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/traps.c | 2 +- 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a75a07f4931f..399f7d1c4c61 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -582,7 +582,8 @@ extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void cpu_init(void); -extern void cpu_init_exception_handling(void); +extern void cpu_init_exception_handling(bool boot_cpu); +extern void cpu_init_replace_early_idt(void); extern void cr4_init(void); extern void set_task_blockstep(struct task_struct *task, bool on); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6de12b3c1b04..a4735d9b5a1d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2176,7 +2176,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) * Setup everything needed to handle exceptions from the IDT, including the IST * exceptions which use paranoid_entry(). */ -void cpu_init_exception_handling(void) +void cpu_init_exception_handling(bool boot_cpu) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); int cpu = raw_smp_processor_id(); @@ -2196,13 +2196,24 @@ void cpu_init_exception_handling(void) setup_ghcb(); if (cpu_feature_enabled(X86_FEATURE_FRED)) { - cpu_init_fred_exceptions(); + /* The boot CPU has enabled FRED during early boot */ + if (!boot_cpu) + cpu_init_fred_exceptions(); + cpu_init_fred_rsps(); } else { load_current_idt(); } } +void __init cpu_init_replace_early_idt(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + idt_setup_early_pf(); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT. We diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6129dc2ba784..f1fea506e20f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1039,7 +1039,12 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - idt_setup_early_pf(); + /* + * init_mem_mapping() relies on the early IDT page fault handling. + * Now either enable FRED or install the real page fault handler + * for 64-bit in the IDT. + */ + cpu_init_replace_early_idt(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0c35207320cb..dc4fff8fccce 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -246,7 +246,7 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); } - cpu_init_exception_handling(); + cpu_init_exception_handling(false); /* * Load the microcode before reaching the AP alive synchronization diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6afb41e6cbbb..197d5888b0e2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1411,7 +1411,7 @@ void __init trap_init(void) sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ - cpu_init_exception_handling(); + cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) From 22f42697265589534df9b109fe0b0efa36896509 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 14 Aug 2024 11:16:36 +0800 Subject: [PATCH 068/126] x86/platform/uv: Remove unused declaration uv_irq_2_mmr_info() Commit 43fe1abc18a2 ("x86/uv: Use hierarchical irqdomain to manage UV interrupts") removed the implementation but left declaration. Signed-off-by: Yue Haibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240814031636.1304772-1-yuehaibing@huawei.com --- arch/x86/include/asm/uv/uv_irq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index d6b17c760622..1876b5edd142 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -31,7 +31,6 @@ enum { UV_AFFINITY_CPU }; -extern int uv_irq_2_mmr_info(int, unsigned long *, int *); extern int uv_setup_irq(char *, int, int, unsigned long, int); extern void uv_teardown_irq(unsigned int); From 1aa0c92f816b3a136cc3a31ef184206a19fc3c03 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 14 Aug 2024 11:19:22 +0800 Subject: [PATCH 069/126] x86/mm: Remove unused NX related declarations Since commit 4763ed4d4552 ("x86, mm: Clean up and simplify NX enablement") these declarations is unused and can be removed. Signed-off-by: Yue Haibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240814031922.2333198-1-yuehaibing@huawei.com --- arch/x86/include/asm/pgtable_types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 2f321137736c..6f82e75b6149 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -517,8 +517,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; -extern void set_nx(void); -extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); From d4245fd4a62931aebd1c5e6b7b6f51b6ef7ad087 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Wed, 14 Aug 2024 20:46:45 +0800 Subject: [PATCH 070/126] x86/mm: Remove duplicate check from build_cr3() There is already a check for 'asid > MAX_ASID_AVAILABLE' in kern_pcid(), so it is unnecessary to perform this check in build_cr3() right before calling kern_pcid(). Remove it. Signed-off-by: Yuntao Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240814124645.51019-1-yuntao.wang@linux.dev --- arch/x86/mm/tlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 09950feffd07..86593d1b787d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -158,7 +158,6 @@ static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) unsigned long cr3 = __sme_pa(pgd) | lam; if (static_cpu_has(X86_FEATURE_PCID)) { - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); cr3 |= kern_pcid(asid); } else { VM_WARN_ON_ONCE(asid != 0); From 8839adc33ff7a5464dbfc17de8afe5fedf9c6029 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 9 Aug 2024 11:52:48 +0200 Subject: [PATCH 071/126] Documentation: devres: fix error about PCI devres The documentation states that pcim_enable_device() will make "all PCI ops" managed. This is totally false, only a small subset of PCI functions become managed that way. Implicating otherwise has caused at least one bug so far, namely in commit 8558de401b5f ("drm/vboxvideo: use managed pci functions"). Change the function summary so the functions dangerous behavior becomes obvious. Signed-off-by: Philipp Stanner Acked-by: Bjorn Helgaas Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240809095248.14220-2-pstanner@redhat.com --- Documentation/driver-api/driver-model/devres.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index ac9ee7441887..5f2ee8d717b1 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -391,7 +391,7 @@ PCI devm_pci_remap_cfgspace() : ioremap PCI configuration space devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource - pcim_enable_device() : after success, all PCI ops become managed + pcim_enable_device() : after success, some PCI ops become managed pcim_iomap() : do iomap() on a single BAR pcim_iomap_regions() : do request_region() and iomap() on multiple BARs pcim_iomap_regions_request_all() : do request_region() on all and iomap() on multiple BARs From 0769a1b7cf30d9a8af0657299e18713ce1587f57 Mon Sep 17 00:00:00 2001 From: David Hunter Date: Wed, 7 Aug 2024 14:53:32 -0400 Subject: [PATCH 072/126] Documentation: Capitalize Fahrenheit in watchdog-api.rst Capitalize "fahrenheit," a spelling mistake. Signed-off-by: David Hunter Reviewed-by: Guenter Roeck Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240807185332.61624-1-david.hunter.linux@gmail.com --- Documentation/watchdog/watchdog-api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/watchdog/watchdog-api.rst b/Documentation/watchdog/watchdog-api.rst index 800dcd7586f2..78e228c272cf 100644 --- a/Documentation/watchdog/watchdog-api.rst +++ b/Documentation/watchdog/watchdog-api.rst @@ -249,7 +249,7 @@ Note that not all devices support these two calls, and some only support the GETBOOTSTATUS call. Some drivers can measure the temperature using the GETTEMP ioctl. The -returned value is the temperature in degrees fahrenheit:: +returned value is the temperature in degrees Fahrenheit:: int temperature; ioctl(fd, WDIOC_GETTEMP, &temperature); From d19d638b1e6cf746263ef60b7d0dee0204d8216a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Jul 2024 13:22:06 -0700 Subject: [PATCH 073/126] x86/syscall: Avoid memcpy() for ia32 syscall_get_arguments() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern (fortified) memcpy() prefers to avoid writing (or reading) beyond the end of the addressed destination (or source) struct member: In function ‘fortify_memcpy_chk’, inlined from ‘syscall_get_arguments’ at ./arch/x86/include/asm/syscall.h:85:2, inlined from ‘populate_seccomp_data’ at kernel/seccomp.c:258:2, inlined from ‘__seccomp_filter’ at kernel/seccomp.c:1231:3: ./include/linux/fortify-string.h:580:25: error: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror=attribute-warning] 580 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ As already done for x86_64 and compat mode, do not use memcpy() to extract syscall arguments from struct pt_regs but rather just perform direct assignments. Binary output differences are negligible, and actually ends up using less stack space: - sub $0x84,%esp + sub $0x6c,%esp and less text size: text data bss dec hex filename 10794 252 0 11046 2b26 gcc-32b/kernel/seccomp.o.stock 10714 252 0 10966 2ad6 gcc-32b/kernel/seccomp.o.after Closes: https://lore.kernel.org/lkml/9b69fb14-df89-4677-9c82-056ea9e706f5@gmail.com/ Reported-by: Mirsad Todorovac Signed-off-by: Kees Cook Signed-off-by: Dave Hansen Reviewed-by: Gustavo A. R. Silva Acked-by: Dave Hansen Tested-by: Mirsad Todorovac Link: https://lore.kernel.org/all/20240708202202.work.477-kees%40kernel.org --- arch/x86/include/asm/syscall.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2fc7bc3863ff..7c488ff0c764 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -82,7 +82,12 @@ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { - memcpy(args, ®s->bx, 6 * sizeof(args[0])); + args[0] = regs->bx; + args[1] = regs->cx; + args[2] = regs->dx; + args[3] = regs->si; + args[4] = regs->di; + args[5] = regs->bp; } static inline int syscall_get_arch(struct task_struct *task) From decb9ac4a9739c16e228f7b2918bfdca34cc89a9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 22 Aug 2024 17:18:08 -0700 Subject: [PATCH 074/126] x86/cpu_entry_area: Annotate percpu_setup_exception_stacks() as __init After a recent LLVM change that deduces __cold on functions that only call cold code (such as __init functions), there is a section mismatch warning from percpu_setup_exception_stacks(), which got moved to .text.unlikely. as a result of that optimization: WARNING: modpost: vmlinux: section mismatch in reference: percpu_setup_exception_stacks+0x3a (section: .text.unlikely.) -> cea_map_percpu_pages (section: .init.text) Drop the inline keyword, which does not guarantee inlining, and replace it with __init, as percpu_setup_exception_stacks() is only called from __init code, which clears up the warning. Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240822-x86-percpu_setup_exception_stacks-init-v1-1-57c5921b8209@kernel.org --- arch/x86/mm/cpu_entry_area.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index e91500a80963..575f863f3c75 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -164,7 +164,7 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) } } #else -static inline void percpu_setup_exception_stacks(unsigned int cpu) +static void __init percpu_setup_exception_stacks(unsigned int cpu) { struct cpu_entry_area *cea = get_cpu_entry_area(cpu); From 741fc1d788c0aff3022249e561fd489c7adaded3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Sat, 24 Aug 2024 20:02:34 +0800 Subject: [PATCH 075/126] x86/mtrr: Remove obsolete declaration for mtrr_bp_restore() mtrr_bp_restore() has been removed in commit 0b9a6a8bedbf ("x86/mtrr: Add a stop_machine() handler calling only cache_cpu_init()"), but the declaration was left behind. Remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240824120234.2516830-1-cuigaosheng1@huawei.com --- arch/x86/include/asm/mtrr.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 090d658a85a6..4218248083d9 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -69,7 +69,6 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment); extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); -extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); void mtrr_disable(void); @@ -117,7 +116,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } #define mtrr_bp_init() do {} while (0) -#define mtrr_bp_restore() do {} while (0) #define mtrr_disable() do {} while (0) #define mtrr_enable() do {} while (0) #define mtrr_generic_set_state() do {} while (0) From 80a4da05642c384bc6f5b602b865ebd7e3963902 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 24 Aug 2024 23:17:10 +0100 Subject: [PATCH 076/126] x86/EISA: Use memremap() to probe for the EISA BIOS signature The area at the 0x0FFFD9 physical location in the PC memory space is regular memory, traditionally ROM BIOS and more recently a copy of BIOS code and data in RAM, write-protected. Therefore use memremap() to get access to it rather than ioremap(), avoiding issues in virtualization scenarios and complementing changes such as commit f7750a795687 ("x86, mpparse, x86/acpi, x86/PCI, x86/dmi, SFI: Use memremap() for RAM mappings") or commit 5997efb96756 ("x86/boot: Use memremap() to map the MPF and MPC data"). Reported-by: Kirill A. Shutemov Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/alpine.DEB.2.21.2408242025210.30766@angie.orcam.me.uk Closes: https://lore.kernel.org/r/20240822095122.736522-1-kirill.shutemov@linux.intel.com --- arch/x86/kernel/eisa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index 53935b4d62e3..5a4f99a098bf 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -11,15 +11,15 @@ static __init int eisa_bus_probe(void) { - void __iomem *p; + void *p; if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return 0; - p = ioremap(0x0FFFD9, 4); + p = memremap(0x0FFFD9, 4, MEMREMAP_WB); if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; - iounmap(p); + memunmap(p); return 0; } subsys_initcall(eisa_bus_probe); From c6e6a3c1698a86bacf7eac256b0f1215a3616dc7 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 25 Aug 2024 20:06:49 +1200 Subject: [PATCH 077/126] x86/sgx: Fix a W=1 build warning in function comment Building the SGX code with W=1 generates below warning: arch/x86/kernel/cpu/sgx/main.c:741: warning: Function parameter or struct member 'low' not described in 'sgx_calc_section_metric' arch/x86/kernel/cpu/sgx/main.c:741: warning: Function parameter or struct member 'high' not described in 'sgx_calc_section_metric' ... The function sgx_calc_section_metric() is a simple helper which is only used in sgx/main.c. There's no need to use kernel-doc style comment for it. Downgrade to a normal comment. Signed-off-by: Kai Huang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240825080649.145250-1-kai.huang@intel.com --- arch/x86/kernel/cpu/sgx/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 27892e57c4ef..1a000acd933a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -732,7 +732,7 @@ out: return 0; } -/** +/* * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the * metric. From 3c41ad39f179c0e41f585975eb6834cd14f286ec Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 25 Aug 2024 20:18:05 +1200 Subject: [PATCH 078/126] x86/kexec: Fix a comment of swap_pages() assembly When relocate_kernel() gets called, %rdi holds 'indirection_page' and %rsi holds 'page_list'. And %rdi always holds 'indirection_page' when swap_pages() is called. Therefore the comment of the first line code of swap_pages() movq %rdi, %rcx /* Put the page_list in %rcx */ .. isn't correct because it actually moves the 'indirection_page' to the %rcx. Fix it. Signed-off-by: Kai Huang Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/adafdfb1421c88efce04420fc9a996c0e2ca1b34.1724573384.git.kai.huang@intel.com --- arch/x86/kernel/relocate_kernel_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 042c9a0334e9..f7a3ca3dee53 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -258,7 +258,7 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) UNWIND_HINT_END_OF_STACK - movq %rdi, %rcx /* Put the page_list in %rcx */ + movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi xorl %esi, %esi jmp 1f From ea49cdb26e7cffc261ceb1db26707e6d337fa104 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sun, 25 Aug 2024 20:18:06 +1200 Subject: [PATCH 079/126] x86/kexec: Add comments around swap_pages() assembly to improve readability The current assembly around swap_pages() in the relocate_kernel() takes some time to follow because the use of registers can be easily lost when the line of assembly goes long. Add a couple of comments to clarify the code around swap_pages() to improve readability. Signed-off-by: Kai Huang Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Link: https://lore.kernel.org/all/8b52b0b8513a34b2a02fb4abb05c6700c2821475.1724573384.git.kai.huang@intel.com --- arch/x86/kernel/relocate_kernel_64.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index f7a3ca3dee53..e9e88c342f75 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -170,6 +170,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) wbinvd .Lsme_off: + /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 call swap_pages @@ -289,18 +290,21 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi - movq %rdi, %rdx - movq %rsi, %rax + movq %rdi, %rdx /* Save destination page to %rdx */ + movq %rsi, %rax /* Save source page to %rax */ + /* copy source page to swap page */ movq %r10, %rdi movl $512, %ecx rep ; movsq + /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx rep ; movsq + /* copy swap page to destination page */ movq %rdx, %rdi movq %r10, %rsi movl $512, %ecx From 7678a53a1688e3d03337ca884b284c6e7b060ec5 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Sun, 25 Aug 2024 18:46:53 +0800 Subject: [PATCH 080/126] x86/cpu: Clarify the error message when BIOS does not support SGX When SGX is not supported by the BIOS, the kernel log contains the error 'SGX disabled by BIOS', which can be confusing since there might not be an SGX-related option in the BIOS settings. For the kernel it's difficult to distinguish between the BIOS not supporting SGX and the BIOS supporting SGX but having it disabled. Therefore, update the error message to 'SGX disabled or unsupported by BIOS' to make it easier for those reading kernel logs to understand what's happening. Reported-by: Bo Wu Co-developed-by: Zelong Xiang Signed-off-by: Zelong Xiang Signed-off-by: WangYuli Signed-off-by: Thomas Gleixner Acked-by: Kai Huang Link: https://lore.kernel.org/all/F8D977CB368423F3+20240825104653.1294624-1-wangyuli@uniontech.com Closes: https://github.com/linuxdeepin/developer-center/issues/10032 --- arch/x86/kernel/cpu/feat_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 1640ae76548f..4a4118784c13 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -188,7 +188,7 @@ update_caps: update_sgx: if (!(msr & FEAT_CTL_SGX_ENABLED)) { if (enable_sgx_kvm || enable_sgx_driver) - pr_err_once("SGX disabled by BIOS.\n"); + pr_err_once("SGX disabled or unsupported by BIOS.\n"); clear_cpu_cap(c, X86_FEATURE_SGX); return; } From b51207dc02ec3aeaa849e419f79055d7334845b6 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 19 Aug 2024 10:33:13 +0200 Subject: [PATCH 081/126] x86/boot/64: Strip percpu address space when setting up GDT descriptors init_per_cpu_var() returns a pointer in the percpu address space while rip_rel_ptr() expects a pointer in the generic address space. When strict address space checks are enabled, GCC's named address space checks fail: asm.h:124:63: error: passing argument 1 of 'rip_rel_ptr' from pointer to non-enclosed address space Add a explicit cast to remove address space of the returned pointer. Fixes: 11e36b0f7c21 ("x86/boot/64: Load the final kernel GDT during early boot directly, remove startup_gdt[]") Signed-off-by: Uros Bizjak Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240819083334.148536-1-ubizjak@gmail.com --- arch/x86/kernel/head64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a817ed0724d1..4b9d4557fc94 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -559,10 +559,11 @@ void early_setup_idt(void) */ void __head startup_64_setup_gdt_idt(void) { + struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt); void *handler = NULL; struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)), + .address = (unsigned long)&RIP_REL_REF(*gdt), .size = GDT_SIZE - 1, }; From cc5e03f3be3154f860c9d08b2ac8c139863e1515 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 16 Aug 2024 18:22:19 +0800 Subject: [PATCH 082/126] x86/extable: Remove unused declaration fixup_bug() Commit 15a416e8aaa7 ("x86/entry: Treat BUG/WARN as NMI-like entries") removed the implementation but left the declaration. Signed-off-by: Yue Haibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240816102219.883297-1-yuehaibing@huawei.com --- arch/x86/include/asm/extable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index eeed395c3177..a0e0c6b50155 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -37,7 +37,6 @@ struct pt_regs; extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); -extern int fixup_bug(struct pt_regs *regs, int trapnr); extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); From 0f70fdd42559b4eed8f74bf7389656a8ae3eb5c3 Mon Sep 17 00:00:00 2001 From: Richard Gong Date: Mon, 19 Aug 2024 07:30:41 -0500 Subject: [PATCH 083/126] x86/amd_nb: Add new PCI IDs for AMD family 1Ah model 60h-70h Add new PCI IDs for Device 18h and Function 4 to enable the amd_atl driver on those systems. Signed-off-by: Richard Gong Signed-off-by: Thomas Gleixner Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/all/20240819123041.915734-1-richard.gong@amd.com --- arch/x86/kernel/amd_nb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 61eadde08511..dc5d3216af24 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -44,6 +44,8 @@ #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 +#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4 0x124c +#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4 0x12bc #define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 #define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c @@ -125,6 +127,8 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, {} From 723edbd2ca5fb4c78ac4a5644511c63895fd1c57 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Fri, 16 Aug 2024 03:43:16 -0700 Subject: [PATCH 084/126] x86/fred: Set SS to __KERNEL_DS when enabling FRED SS is initialized to NULL during boot time and not explicitly set to __KERNEL_DS. With FRED enabled, if a kernel event is delivered before a CPU goes to user level for the first time, its SS is NULL thus NULL is pushed into the SS field of the FRED stack frame. But before ERETS is executed, the CPU may context switch to another task and go to user level. Then when the CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later when ERETS is executed to return from the kernel event handler, a #GP fault is generated because SS doesn't match the SS saved in the FRED stack frame. Initialize SS to __KERNEL_DS when enabling FRED to prevent that. Note, IRET doesn't check if SS matches the SS saved in its stack frame, thus IDT doesn't have this problem. For IDT it doesn't matter whether SS is set to __KERNEL_DS or not, because it's set to NULL upon interrupt or exception delivery and __KERNEL_DS upon SYSCALL. Thus it's pointless to initialize SS for IDT. Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240816104316.2276968-1-xin@zytor.com --- arch/x86/kernel/fred.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 99a134fcd5bf..266c69e332a4 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -26,6 +26,20 @@ void cpu_init_fred_exceptions(void) /* When FRED is enabled by default, remove this log message */ pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + /* + * If a kernel event is delivered before a CPU goes to user level for + * the first time, its SS is NULL thus NULL is pushed into the SS field + * of the FRED stack frame. But before ERETS is executed, the CPU may + * context switch to another task and go to user level. Then when the + * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later + * when ERETS is executed to return from the kernel event handler, a #GP + * fault is generated because SS doesn't match the SS saved in the FRED + * stack frame. + * + * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs. + */ + loadsegment(ss, __KERNEL_DS); + wrmsrl(MSR_IA32_FRED_CONFIG, /* Reserve for CALL emulation */ FRED_CONFIG_REDZONE | From 0dfac6f267fa091aa348c6a6742b463c9e7c98e3 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Thu, 22 Aug 2024 00:39:04 -0700 Subject: [PATCH 085/126] x86/entry: Test ti_work for zero before processing individual bits In most cases, ti_work values passed to arch_exit_to_user_mode_prepare() are zeros, e.g., 99% in kernel build tests. So an obvious optimization is to test ti_work for zero before processing individual bits in it. Omit the optimization when FPU debugging is enabled, otherwise the FPU consistency check is never executed. Intel 0day tests did not find a perfermance regression with this change. Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240822073906.2176342-2-xin@zytor.com --- arch/x86/include/asm/entry-common.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index fb2809b20b0a..db970828f385 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -44,8 +44,7 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) } #define arch_enter_from_user_mode arch_enter_from_user_mode -static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, - unsigned long ti_work) +static inline void arch_exit_work(unsigned long ti_work) { if (ti_work & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); @@ -56,6 +55,13 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, fpregs_assert_state_consistent(); if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); +} + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) + arch_exit_work(ti_work); #ifdef CONFIG_COMPAT /* From efe508816d2caf83536ff2f308e09043380fb2b7 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Thu, 22 Aug 2024 00:39:05 -0700 Subject: [PATCH 086/126] x86/msr: Switch between WRMSRNS and WRMSR with the alternatives mechanism Per the discussion about FRED MSR writes with WRMSRNS instruction [1], use the alternatives mechanism to choose WRMSRNS when it's available, otherwise fallback to WRMSR. Remove the dependency on X86_FEATURE_WRMSRNS as WRMSRNS is no longer dependent on FRED. [1] https://lore.kernel.org/lkml/15f56e6a-6edd-43d0-8e83-bb6430096514@citrix.com/ Use DS prefix to pad WRMSR instead of a NOP. The prefix is ignored. At least that's the current information from the hardware folks. Signed-off-by: Andrew Cooper Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240822073906.2176342-3-xin@zytor.com --- arch/x86/include/asm/msr.h | 25 +++++++++++-------------- arch/x86/include/asm/switch_to.h | 1 - arch/x86/kernel/cpu/cpuid-deps.c | 1 - 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index d642037f9ed5..001853541f1e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -99,19 +99,6 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) : : "c" (msr), "a"(low), "d" (high) : "memory"); } -/* - * WRMSRNS behaves exactly like WRMSR with the only difference being - * that it is not a serializing instruction by default. - */ -static __always_inline void __wrmsrns(u32 msr, u32 low, u32 high) -{ - /* Instruction opcode for WRMSRNS; supported in binutils >= 2.40. */ - asm volatile("1: .byte 0x0f,0x01,0xc6\n" - "2:\n" - _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) - : : "c" (msr), "a"(low), "d" (high)); -} - #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ @@ -312,9 +299,19 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ +/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ +#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) + +/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ static __always_inline void wrmsrns(u32 msr, u64 val) { - __wrmsrns(msr, val, val >> 32); + /* + * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant + * DS prefix to avoid a trailing NOP. + */ + asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) + "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); } /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index c3bd0c0758c9..e9ded149a9e3 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -71,7 +71,6 @@ static inline void update_task_stack(struct task_struct *task) this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else if (cpu_feature_enabled(X86_FEATURE_FRED)) { - /* WRMSRNS is a baseline feature for FRED. */ wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { /* Xen PV enters the kernel on the thread stack. */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b7d9f530ae16..8bd84114c2d9 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -83,7 +83,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, - { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, {} }; From fe85ee391966c4cf3bfe1c405314e894c951f521 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Thu, 22 Aug 2024 00:39:06 -0700 Subject: [PATCH 087/126] x86/entry: Set FRED RSP0 on return to userspace instead of context switch The FRED RSP0 MSR points to the top of the kernel stack for user level event delivery. As this is the task stack it needs to be updated when a task is scheduled in. The update is done at context switch. That means it's also done when switching to kernel threads, which is pointless as those never go out to user space. For KVM threads this means there are two writes to FRED_RSP0 as KVM has to switch to the guest value before VMENTER. Defer the update to the exit to user space path and cache the per CPU FRED_RSP0 value, so redundant writes can be avoided. Provide fred_sync_rsp0() for KVM to keep the cache in sync with the actual MSR value after returning from guest to host mode. [ tglx: Massage change log ] Suggested-by: Sean Christopherson Suggested-by: Thomas Gleixner Signed-off-by: Xin Li (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240822073906.2176342-4-xin@zytor.com --- arch/x86/include/asm/entry-common.h | 3 +++ arch/x86/include/asm/fred.h | 21 ++++++++++++++++++++- arch/x86/include/asm/switch_to.h | 5 +---- arch/x86/kernel/fred.c | 3 +++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index db970828f385..77d20555e04d 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -8,6 +8,7 @@ #include #include #include +#include /* Check that the stack and regs on entry from user mode are sane. */ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) @@ -63,6 +64,8 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) arch_exit_work(ti_work); + fred_update_rsp0(); + #ifdef CONFIG_COMPAT /* * Compat syscalls set TS_COMPAT. Make sure we clear it before diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h index 66d7dbe2d314..25ca00bd70e8 100644 --- a/arch/x86/include/asm/fred.h +++ b/arch/x86/include/asm/fred.h @@ -36,6 +36,7 @@ #ifdef CONFIG_X86_FRED #include +#include #include @@ -87,12 +88,30 @@ void cpu_init_fred_exceptions(void); void cpu_init_fred_rsps(void); void fred_complete_exception_setup(void); +DECLARE_PER_CPU(unsigned long, fred_rsp0); + +static __always_inline void fred_sync_rsp0(unsigned long rsp0) +{ + __this_cpu_write(fred_rsp0, rsp0); +} + +static __always_inline void fred_update_rsp0(void) +{ + unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE; + + if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) { + wrmsrns(MSR_IA32_FRED_RSP0, rsp0); + __this_cpu_write(fred_rsp0, rsp0); + } +} #else /* CONFIG_X86_FRED */ static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } static inline void cpu_init_fred_exceptions(void) { } static inline void cpu_init_fred_rsps(void) { } static inline void fred_complete_exception_setup(void) { } -static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +static inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) { } +static inline void fred_sync_rsp0(unsigned long rsp0) { } +static inline void fred_update_rsp0(void) { } #endif /* CONFIG_X86_FRED */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index e9ded149a9e3..75248546403d 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -70,12 +70,9 @@ static inline void update_task_stack(struct task_struct *task) #ifdef CONFIG_X86_32 this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - if (cpu_feature_enabled(X86_FEATURE_FRED)) { - wrmsrns(MSR_IA32_FRED_RSP0, (unsigned long)task_stack_page(task) + THREAD_SIZE); - } else if (cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV)) /* Xen PV enters the kernel on the thread stack. */ load_sp0(task_top_of_stack(task)); - } #endif } diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 266c69e332a4..8d32c3f48abc 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -21,6 +21,9 @@ #define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) +DEFINE_PER_CPU(unsigned long, fred_rsp0); +EXPORT_PER_CPU_SYMBOL(fred_rsp0); + void cpu_init_fred_exceptions(void) { /* When FRED is enabled by default, remove this log message */ From 61eb040228ba9a0657937a1e48d6696f66364a55 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 31 Jul 2024 01:45:07 +0200 Subject: [PATCH 088/126] m68k: cmpxchg: Use swap() to improve code Remove the local variable tmp and use the swap() macro instead. Signed-off-by: Thorsten Blum Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20240730234506.492743-2-thorsten.blum@toblux.com Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/cmpxchg.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h index 4ba14f3535fc..71fbe5c5c564 100644 --- a/arch/m68k/include/asm/cmpxchg.h +++ b/arch/m68k/include/asm/cmpxchg.h @@ -3,6 +3,7 @@ #define __ARCH_M68K_CMPXCHG__ #include +#include #define __xg(type, x) ((volatile type *)(x)) @@ -11,25 +12,19 @@ extern unsigned long __invalid_xchg_size(unsigned long, volatile void *, int); #ifndef CONFIG_RMW_INSNS static inline unsigned long __arch_xchg(unsigned long x, volatile void * ptr, int size) { - unsigned long flags, tmp; + unsigned long flags; local_irq_save(flags); switch (size) { case 1: - tmp = *(u8 *)ptr; - *(u8 *)ptr = x; - x = tmp; + swap(*(u8 *)ptr, x); break; case 2: - tmp = *(u16 *)ptr; - *(u16 *)ptr = x; - x = tmp; + swap(*(u16 *)ptr, x); break; case 4: - tmp = *(u32 *)ptr; - *(u32 *)ptr = x; - x = tmp; + swap(*(u32 *)ptr, x); break; default: x = __invalid_xchg_size(x, ptr, size); From 09b3d870faa7bc3e96c0978ab3cf4e96e4b15571 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Sun, 11 Aug 2024 10:12:29 +1000 Subject: [PATCH 089/126] m68k: Fix kernel_clone_args.flags in m68k_clone() Stan Johnson recently reported a failure from the 'dump' command: DUMP: Date of this level 0 dump: Fri Aug 9 23:37:15 2024 DUMP: Dumping /dev/sda (an unlisted file system) to /dev/null DUMP: Label: none DUMP: Writing 10 Kilobyte records DUMP: mapping (Pass I) [regular files] DUMP: mapping (Pass II) [directories] DUMP: estimated 3595695 blocks. DUMP: Context save fork fails in parent 671 The dump program uses the clone syscall with the CLONE_IO flag, that is, flags == 0x80000000. When that value is promoted from long int to u64 by m68k_clone(), it undergoes sign-extension. The new value includes CLONE_INTO_CGROUP so the validation in cgroup_css_set_fork() fails and the syscall returns -EBADF. Avoid sign-extension by casting to u32. Reported-by: Stan Johnson Closes: https://lists.debian.org/debian-68k/2024/08/msg00000.html Fixes: 6aabc1facdb2 ("m68k: Implement copy_thread_tls()") Signed-off-by: Finn Thain Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/3463f1e5d4e95468dc9f3368f2b78ffa7b72199b.1723335149.git.fthain@linux-m68k.org Signed-off-by: Geert Uytterhoeven --- arch/m68k/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 2584e94e2134..fda7eac23f87 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -117,7 +117,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) { /* regs will be equal to current_pt_regs() */ struct kernel_clone_args args = { - .flags = regs->d1 & ~CSIGNAL, + .flags = (u32)(regs->d1) & ~CSIGNAL, .pidfd = (int __user *)regs->d3, .child_tid = (int __user *)regs->d4, .parent_tid = (int __user *)regs->d3, From b90fae5df91744e45e683c17bb1a38e466770df3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 29 Jul 2024 12:26:33 +0200 Subject: [PATCH 090/126] m68k: defconfig: Update defconfigs for v6.11-rc1 - Drop CONFIG_CRYPTO_SM2=m (removed in commit 46b3ff73afc815f1 ("crypto: sm2 - Remove sm2 algorithm")), - Drop CONFIG_TEST_USER_COPY=m (replaced by auto-modular CONFIG_USERCOPY_KUNIT_TEST in commit cf6219ee889fb304 ("usercopy: Convert test_user_copy to KUnit test")). Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/bfe0530e290cee9d350f89c4d41436f3de7cb2a5.1722248695.git.geert@linux-m68k.org --- arch/m68k/configs/amiga_defconfig | 2 -- arch/m68k/configs/apollo_defconfig | 2 -- arch/m68k/configs/atari_defconfig | 2 -- arch/m68k/configs/bvme6000_defconfig | 2 -- arch/m68k/configs/hp300_defconfig | 2 -- arch/m68k/configs/mac_defconfig | 2 -- arch/m68k/configs/multi_defconfig | 2 -- arch/m68k/configs/mvme147_defconfig | 2 -- arch/m68k/configs/mvme16x_defconfig | 2 -- arch/m68k/configs/q40_defconfig | 2 -- arch/m68k/configs/sun3_defconfig | 2 -- arch/m68k/configs/sun3x_defconfig | 2 -- 12 files changed, 24 deletions(-) diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 9a084943a68a..d01dc47d52ea 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -559,7 +559,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -636,7 +635,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 58ec725bf392..46808e581d7b 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -516,7 +516,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -593,7 +592,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 63ab7e892e59..4469a7839c9d 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -536,7 +536,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -613,7 +612,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index ca40744eec60..c0719322c028 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -508,7 +508,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -585,7 +584,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 77bcc6faf468..8d429e63f8f2 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -518,7 +518,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -595,7 +594,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index e73d4032b659..bafd33da27c1 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -535,7 +535,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -612,7 +611,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 638df8442c98..6f5ca3f85ea1 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -621,7 +621,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -698,7 +697,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 2248db426081..d16b328c7136 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -507,7 +507,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -584,7 +583,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 2975b66521f6..80f6c15a5ed5 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -508,7 +508,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -585,7 +584,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 0a0e32344033..0e81589f0ee2 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -525,7 +525,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -602,7 +601,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index f16f1a99c2ba..8cd785290339 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -506,7 +506,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -582,7 +581,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 667bd61ae9b3..78035369f60f 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -506,7 +506,6 @@ CONFIG_CRYPTO_DH=m CONFIG_CRYPTO_ECDH=m CONFIG_CRYPTO_ECDSA=m CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_SM2=m CONFIG_CRYPTO_CURVE25519=m CONFIG_CRYPTO_AES=y CONFIG_CRYPTO_AES_TI=m @@ -583,7 +582,6 @@ CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m -CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m From 50c6dbdfd16e312382842198a7919341ad480e05 Mon Sep 17 00:00:00 2001 From: Max Ramanouski Date: Sun, 25 Aug 2024 01:01:11 +0300 Subject: [PATCH 091/126] x86/ioremap: Improve iounmap() address range checks Allowing iounmap() on memory that was not ioremap()'d in the first place is obviously a bad idea. There is currently a feeble attempt to avoid errant iounmap()s by checking to see if the address is below "high_memory". But that's imprecise at best because there are plenty of high addresses that are also invalid to call iounmap() on. Thankfully, there is a more precise helper: is_ioremap_addr(). x86 just does not use it in iounmap(). Restrict iounmap() to addresses in the ioremap region, by using is_ioremap_addr(). This aligns x86 closer to the generic iounmap() implementation. Additionally, add a warning in case there is an attempt to iounmap() invalid memory. This replaces an existing silent return and will help alert folks to any incorrect usage of iounmap(). Due to VMALLOC_START on i386 not being present in asm/pgtable.h, include for asm/vmalloc.h had to be added to include/linux/ioremap.h. [ dhansen: tweak subject and changelog ] Signed-off-by: Max Ramanouski Signed-off-by: Dave Hansen Reviewed-by: Christoph Hellwig Reviewed-by: Alistair Popple Link: https://lore.kernel.org/all/20240824220111.84441-1-max8rr8%40gmail.com --- arch/x86/mm/ioremap.c | 3 ++- include/linux/ioremap.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index aa7d279321ea..70b02fc61d93 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -457,7 +458,7 @@ void iounmap(volatile void __iomem *addr) { struct vm_struct *p, *o; - if ((void __force *)addr <= high_memory) + if (WARN_ON_ONCE(!is_ioremap_addr((void __force *)addr))) return; /* diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h index f0e99fc7dd8b..2bd1661fe9ad 100644 --- a/include/linux/ioremap.h +++ b/include/linux/ioremap.h @@ -4,6 +4,7 @@ #include #include +#include #if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) /* From 6e56774c17d88524fabfa8a3aef1881abb57d874 Mon Sep 17 00:00:00 2001 From: Aryabhatta Dey Date: Sun, 25 Aug 2024 17:03:40 +0530 Subject: [PATCH 092/126] docs: process: fix typos in Documentation/process/backporting.rst Change 'submiting' to 'submitting', 'famliar' to 'familiar' and 'appared' to 'appeared'. Signed-off-by: Aryabhatta Dey Reviewed-by: Vegard Nossum Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/rd2vu7z2t23ppafto4zxc6jge5mj7w7xnpmwywaa2e3eiojgf2@poicxprsdoks --- Documentation/process/backporting.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/process/backporting.rst b/Documentation/process/backporting.rst index e1a6ea0a1e8a..a71480fcf3b4 100644 --- a/Documentation/process/backporting.rst +++ b/Documentation/process/backporting.rst @@ -73,7 +73,7 @@ Once you have the patch in git, you can go ahead and cherry-pick it into your source tree. Don't forget to cherry-pick with ``-x`` if you want a written record of where the patch came from! -Note that if you are submiting a patch for stable, the format is +Note that if you are submitting a patch for stable, the format is slightly different; the first line after the subject line needs tobe either:: @@ -147,7 +147,7 @@ divergence. It's important to always identify the commit or commits that caused the conflict, as otherwise you cannot be confident in the correctness of your resolution. As an added bonus, especially if the patch is in an -area you're not that famliar with, the changelogs of these commits will +area you're not that familiar with, the changelogs of these commits will often give you the context to understand the code and potential problems or pitfalls with your conflict resolution. @@ -197,7 +197,7 @@ git blame Another way to find prerequisite commits (albeit only the most recent one for a given conflict) is to run ``git blame``. In this case, you need to run it against the parent commit of the patch you are -cherry-picking and the file where the conflict appared, i.e.:: +cherry-picking and the file where the conflict appeared, i.e.:: git blame ^ -- From 14ac4cac4ddb7e0097d5fa190371480fcb2efa5b Mon Sep 17 00:00:00 2001 From: Aryabhatta Dey Date: Sun, 25 Aug 2024 09:20:39 +0530 Subject: [PATCH 093/126] docs: leds: fix typo in Documentation/leds/leds-mlxcpld.rst Change 'cylce' to 'cycle'. Signed-off-by: Aryabhatta Dey Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/5nib2kj6uh7lkafrmmwcjpeyvs7megdfmseftkjws2wcuztoyc@yhidnl4ilbok --- Documentation/leds/leds-mlxcpld.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/leds/leds-mlxcpld.rst b/Documentation/leds/leds-mlxcpld.rst index 528582429e0b..c520a134d91e 100644 --- a/Documentation/leds/leds-mlxcpld.rst +++ b/Documentation/leds/leds-mlxcpld.rst @@ -115,4 +115,4 @@ Driver provides the following LEDs for the system "msn2100": - [1,1,1,1] = Blue blink 6Hz Driver supports HW blinking at 3Hz and 6Hz frequency (50% duty cycle). -For 3Hz duty cylce is about 167 msec, for 6Hz is about 83 msec. +For 3Hz duty cycle is about 167 msec, for 6Hz is about 83 msec. From 6ffc34cefc43f2018971b1905f23314ec0b73220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Gonz=C3=A1lez=20Collado?= Date: Sat, 24 Aug 2024 11:54:02 +0200 Subject: [PATCH 094/126] docs/sp_Sp: Add translation to spanish of the documentation related to EEVDF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Translate Documentation/scheduler/sched-eevdf.rst to Spanish. Signed-off-by: Sergio González Collado Reviewed-by: Carlos Bilbao [jc: fixed build warning] Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240824095402.7706-1-sergio.collado@gmail.com --- .../translations/sp_SP/scheduler/index.rst | 1 + .../sp_SP/scheduler/sched-design-CFS.rst | 8 +-- .../sp_SP/scheduler/sched-eevdf.rst | 58 +++++++++++++++++++ 3 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 Documentation/translations/sp_SP/scheduler/sched-eevdf.rst diff --git a/Documentation/translations/sp_SP/scheduler/index.rst b/Documentation/translations/sp_SP/scheduler/index.rst index 768488d6f001..32f9fd7517b2 100644 --- a/Documentation/translations/sp_SP/scheduler/index.rst +++ b/Documentation/translations/sp_SP/scheduler/index.rst @@ -6,3 +6,4 @@ :maxdepth: 1 sched-design-CFS + sched-eevdf diff --git a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst index 90a153cad4e8..c146e5bba881 100644 --- a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst +++ b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst @@ -14,10 +14,10 @@ Gestor de tareas CFS CFS viene de las siglas en inglés de "Gestor de tareas totalmente justo" ("Completely Fair Scheduler"), y es el nuevo gestor de tareas de escritorio -implementado por Ingo Molnar e integrado en Linux 2.6.23. Es el sustituto de -el previo gestor de tareas SCHED_OTHER. - -Nota: El planificador EEVDF fue incorporado más recientemente al kernel. +implementado por Ingo Molnar e integrado en Linux 2.6.23. Es el sustituto +del previo gestor de tareas SCHED_OTHER. Hoy en día se está abriendo camino +para el gestor de tareas EEVDF, cuya documentación se puede ver en +Documentation/scheduler/sched-eevdf.rst El 80% del diseño de CFS puede ser resumido en una única frase: CFS básicamente modela una "CPU ideal, precisa y multi-tarea" sobre hardware diff --git a/Documentation/translations/sp_SP/scheduler/sched-eevdf.rst b/Documentation/translations/sp_SP/scheduler/sched-eevdf.rst new file mode 100644 index 000000000000..d54736f297b8 --- /dev/null +++ b/Documentation/translations/sp_SP/scheduler/sched-eevdf.rst @@ -0,0 +1,58 @@ + +.. include:: ../disclaimer-sp.rst + +:Original: Documentation/scheduler/sched-eevdf.rst +:Translator: Sergio González Collado + +====================== +Gestor de tareas EEVDF +====================== + +El gestor de tareas EEVDF, del inglés: "Earliest Eligible Virtual Deadline +First", fue presentado por primera vez en una publicación científica en +1995 [1]. El kernel de Linux comenzó a transicionar hacia EEVPF en la +versión 6.6 (y como una nueva opción en 2024), alejándose del gestor +de tareas CFS, en favor de una versión de EEVDF propuesta por Peter +Zijlstra en 2023 [2-4]. Más información relativa a CFS puede encontrarse +en Documentation/scheduler/sched-design-CFS.rst. + +De forma parecida a CFS, EEVDF intenta distribuir el tiempo de ejecución +de la CPU de forma equitativa entre todas las tareas que tengan la misma +prioridad y puedan ser ejecutables. Para eso, asigna un tiempo de +ejecución virtual a cada tarea, creando un "retraso" que puede ser usado +para determinar si una tarea ha recibido su cantidad justa de tiempo +de ejecución en la CPU. De esta manera, una tarea con un "retraso" +positivo, es porque se le debe tiempo de ejecución, mientras que una +con "retraso" negativo implica que la tarea ha excedido su cuota de +tiempo. EEVDF elige las tareas con un "retraso" mayor igual a cero y +calcula un tiempo límite de ejecución virtual (VD, del inglés: virtual +deadline) para cada una, eligiendo la tarea con la VD más próxima para +ser ejecutada a continuación. Es importante darse cuenta que esto permite +que la tareas que sean sensibles a la latencia que tengan porciones de +tiempos de ejecución de CPU más cortos ser priorizadas, lo cual ayuda con +su menor tiempo de respuesta. + +Ahora mismo se está discutiendo cómo gestionar esos "retrasos", especialmente +en tareas que estén en un estado durmiente; pero en el momento en el que +se escribe este texto EEVDF usa un mecanismo de "decaimiento" basado en el +tiempo virtual de ejecución (VRT, del inglés: virtual run time). Esto previene +a las tareas de abusar del sistema simplemente durmiendo brevemente para +reajustar su retraso negativo: cuando una tarea duerme, esta permanece en +la cola de ejecución pero marcada para "desencolado diferido", permitiendo +a su retraso decaer a lo largo de VRT. Por tanto, las tareas que duerman +por más tiempo eventualmente eliminarán su retraso. Finalmente, las tareas +pueden adelantarse a otras si su VD es más próximo en el tiempo, y las +tareas podrán pedir porciones de tiempo específicas con la nueva llamada +del sistema sched_setattr(), todo esto facilitara el trabajo de las aplicaciones +que sean sensibles a las latencias. + +REFERENCIAS +=========== + +[1] https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=805acf7726282721504c8f00575d91ebfd750564 + +[2] https://lore.kernel.org/lkml/a79014e6-ea83-b316-1e12-2ae056bda6fa@linux.vnet.ibm.com/ + +[3] https://lwn.net/Articles/969062/ + +[4] https://lwn.net/Articles/925371/ From a4931bb8b066f66786182e6d8fd5fc27f4120f71 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Aug 2024 21:43:39 +0800 Subject: [PATCH 095/126] Documentation: add ublk driver ioctl numbers ublk driver takes the following ioctl numbers: 'u' 00-2F linux/ublk_cmd.h So document it in ioctl-number.rst Signed-off-by: Ming Lei Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240823134339.1496916-1-ming.lei@redhat.com --- Documentation/userspace-api/ioctl/ioctl-number.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 217bdc76fe56..e4be1378ba26 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -293,6 +293,7 @@ Code Seq# Include File Comments 't' 80-8F linux/isdn_ppp.h 't' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM 'u' 00-1F linux/smb_fs.h gone +'u' 00-2F linux/ublk_cmd.h conflict! 'u' 20-3F linux/uvcvideo.h USB video class host driver 'u' 40-4f linux/udmabuf.h userspace dma-buf misc device 'v' 00-1F linux/ext2_fs.h conflict! From cbbdb6c625f6415c30f5dbc9305f1d2d5b79b02f Mon Sep 17 00:00:00 2001 From: Thorsten Leemhuis Date: Thu, 22 Aug 2024 09:35:33 +0200 Subject: [PATCH 096/126] docs: bug-bisect: rewrite to better match the other bisecting text Rewrite the short document on bisecting kernel bugs. The new text improves .config handling, brings a mention of 'git bisect skip', and explains what to do after the bisection finished -- including trying a revert to verify the result. The rewrite at the same time removes the unrelated and outdated section on 'Devices not appearing' and replaces some sentences about bug reporting with a pointer to the document covering that topic in detail. This overall brings the approach close to the one in the recently added text Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst. As those two texts serve a similar purpose for different audiences, mention that document in the head of this one and outline when the other might be the better one to follow. Signed-off-by: Thorsten Leemhuis Reviewed-by: Petr Tesarik Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/74dc0137dcc3e2c05648e885a7bc31ffd39a0890.1724312119.git.linux@leemhuis.info --- Documentation/admin-guide/bug-bisect.rst | 166 ++++++++++++++++------- MAINTAINERS | 1 + 2 files changed, 118 insertions(+), 49 deletions(-) diff --git a/Documentation/admin-guide/bug-bisect.rst b/Documentation/admin-guide/bug-bisect.rst index 325c5d0ed34a..585630d14581 100644 --- a/Documentation/admin-guide/bug-bisect.rst +++ b/Documentation/admin-guide/bug-bisect.rst @@ -1,76 +1,144 @@ -Bisecting a bug -+++++++++++++++ +.. SPDX-License-Identifier: (GPL-2.0+ OR CC-BY-4.0) +.. [see the bottom of this file for redistribution information] -Last updated: 28 October 2016 +====================== +Bisecting a regression +====================== -Introduction -============ +This document describes how to use a ``git bisect`` to find the source code +change that broke something -- for example when some functionality stopped +working after upgrading from Linux 6.0 to 6.1. -Always try the latest kernel from kernel.org and build from source. If you are -not confident in doing that please report the bug to your distribution vendor -instead of to a kernel developer. +The text focuses on the gist of the process. If you are new to bisecting the +kernel, better follow Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst +instead: it depicts everything from start to finish while covering multiple +aspects even kernel developers occasionally forget. This includes detecting +situations early where a bisection would be a waste of time, as nobody would +care about the result -- for example, because the problem happens after the +kernel marked itself as 'tainted', occurs in an abandoned version, was already +fixed, or is caused by a .config change you or your Linux distributor performed. -Finding bugs is not always easy. Have a go though. If you can't find it don't -give up. Report as much as you have found to the relevant maintainer. See -MAINTAINERS for who that is for the subsystem you have worked on. +Finding the change causing a kernel issue using a bisection +=========================================================== -Before you submit a bug report read -'Documentation/admin-guide/reporting-issues.rst'. +*Note: the following process assumes you prepared everything for a bisection. +This includes having a Git clone with the appropriate sources, installing the +software required to build and install kernels, as well as a .config file stored +in a safe place (the following example assumes '~/prepared_kernel_.config') to +use as pristine base at each bisection step; ideally, you have also worked out +a fully reliable and straight-forward way to reproduce the regression, too.* -Devices not appearing -===================== +* Preparation: start the bisection and tell Git about the points in the history + you consider to be working and broken, which Git calls 'good' and 'bad':: -Often this is caused by udev/systemd. Check that first before blaming it -on the kernel. + git bisect start + git bisect good v6.0 + git bisect bad v6.1 -Finding patch that caused a bug -=============================== + Instead of Git tags like 'v6.0' and 'v6.1' you can specify commit-ids, too. -Using the provided tools with ``git`` makes finding bugs easy provided the bug -is reproducible. +1. Copy your prepared .config into the build directory and adjust it to the + needs of the codebase Git checked out for testing:: -Steps to do it: + cp ~/prepared_kernel_.config .config + make olddefconfig -- build the Kernel from its git source -- start bisect with [#f1]_:: +2. Now build, install, and boot a kernel. This might fail for unrelated reasons, + for example, when a compile error happens at the current stage of the + bisection a later change resolves. In such cases run ``git bisect skip`` and + go back to step 1. - $ git bisect start +3. Check if the functionality that regressed works in the kernel you just built. -- mark the broken changeset with:: + If it works, execute:: - $ git bisect bad [commit] + git bisect good -- mark a changeset where the code is known to work with:: + If it is broken, run:: - $ git bisect good [commit] + git bisect bad -- rebuild the Kernel and test -- interact with git bisect by using either:: + Note, getting this wrong just once will send the rest of the bisection + totally off course. To prevent having to start anew later you thus want to + ensure what you tell Git is correct; it is thus often wise to spend a few + minutes more on testing in case your reproducer is unreliable. - $ git bisect good + After issuing one of these two commands, Git will usually check out another + bisection point and print something like 'Bisecting: 675 revisions left to + test after this (roughly 10 steps)'. In that case go back to step 1. - or:: + If Git instead prints something like 'cafecaca0c0dacafecaca0c0dacafecaca0c0da + is the first bad commit', then you have finished the bisection. In that case + move to the next point below. Note, right after displaying that line Git will + show some details about the culprit including its patch description; this can + easily fill your terminal, so you might need to scroll up to see the message + mentioning the culprit's commit-id. - $ git bisect bad + In case you missed Git's output, you can always run ``git bisect log`` to + print the status: it will show how many steps remain or mention the result of + the bisection. - depending if the bug happened on the changeset you're testing -- After some interactions, git bisect will give you the changeset that - likely caused the bug. +* Recommended complementary task: put the bisection log and the current .config + file aside for the bug report; furthermore tell Git to reset the sources to + the state before the bisection:: -- For example, if you know that the current version is bad, and version - 4.8 is good, you could do:: + git bisect log > ~/bisection-log + cp .config ~/bisection-config-culprit + git bisect reset - $ git bisect start - $ git bisect bad # Current version is bad - $ git bisect good v4.8 +* Recommended optional task: try reverting the culprit on top of the latest + codebase and check if that fixes your bug; if that is the case, it validates + the bisection and enables developers to resolve the regression through a + revert. + + To try this, update your clone and check out latest mainline. Then tell Git + to revert the change by specifying its commit-id:: + + git revert --no-edit cafec0cacaca0 + + Git might reject this, for example when the bisection landed on a merge + commit. In that case, abandon the attempt. Do the same, if Git fails to revert + the culprit on its own because later changes depend on it -- at least unless + you bisected a stable or longterm kernel series, in which case you want to + check out its latest codebase and try a revert there. + + If a revert succeeds, build and test another kernel to check if reverting + resolved your regression. + +With that the process is complete. Now report the regression as described by +Documentation/admin-guide/reporting-issues.rst. -.. [#f1] You can, optionally, provide both good and bad arguments at git - start with ``git bisect start [BAD] [GOOD]`` +Additional reading material +--------------------------- -For further references, please read: +* The `man page for 'git bisect' `_ and + `fighting regressions with 'git bisect' `_ + in the Git documentation. +* `Working with git bisect `_ + from kernel developer Nathan Chancellor. +* `Using Git bisect to figure out when brokenness was introduced `_. +* `Fully automated bisecting with 'git bisect run' `_. -- The man page for ``git-bisect`` -- `Fighting regressions with git bisect `_ -- `Fully automated bisecting with "git bisect run" `_ -- `Using Git bisect to figure out when brokenness was introduced `_ +.. + end-of-content +.. + This document is maintained by Thorsten Leemhuis . If + you spot a typo or small mistake, feel free to let him know directly and + he'll fix it. You are free to do the same in a mostly informal way if you + want to contribute changes to the text -- but for copyright reasons please CC + linux-doc@vger.kernel.org and 'sign-off' your contribution as + Documentation/process/submitting-patches.rst explains in the section 'Sign + your work - the Developer's Certificate of Origin'. +.. + This text is available under GPL-2.0+ or CC-BY-4.0, as stated at the top + of the file. If you want to distribute this text under CC-BY-4.0 only, + please use 'The Linux kernel development community' for author attribution + and link this as source: + https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/Documentation/admin-guide/bug-bisect.rst + +.. + Note: Only the content of this RST file as found in the Linux kernel sources + is available under CC-BY-4.0, as versions of this text that were processed + (for example by the kernel's build system) might contain content taken from + files which use a more restrictive license. diff --git a/MAINTAINERS b/MAINTAINERS index b34385f2e46d..90c8681d4d31 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6719,6 +6719,7 @@ DOCUMENTATION REPORTING ISSUES M: Thorsten Leemhuis L: linux-doc@vger.kernel.org S: Maintained +F: Documentation/admin-guide/bug-bisect.rst F: Documentation/admin-guide/quickly-build-trimmed-linux.rst F: Documentation/admin-guide/reporting-issues.rst F: Documentation/admin-guide/verify-bugs-and-bisect-regressions.rst From 0a6339ff3906881a0bb13c9c96bc3785cc89af56 Mon Sep 17 00:00:00 2001 From: Gianfranco Trad Date: Mon, 19 Aug 2024 13:06:41 +0200 Subject: [PATCH 097/126] Fix typo "allocateed" to allocated Fix minor typo in the fault-injection documentation, where the word allocated was misspelled as "allocateed". Signed-off-by: Gianfranco Trad Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240819110641.1931-1-gianf.trad@gmail.com --- Documentation/fault-injection/fault-injection.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 07c24710bd21..8b8aeea71c68 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -291,7 +291,7 @@ kernel may crash because it may not be able to handle the error. There are 4 types of errors defined in include/asm-generic/error-injection.h EI_ETYPE_NULL - This function will return `NULL` if it fails. e.g. return an allocateed + This function will return `NULL` if it fails. e.g. return an allocated object address. EI_ETYPE_ERRNO From 748c3404c8cd3e019df4d1cce9582176c541fb3a Mon Sep 17 00:00:00 2001 From: Haoyang Liu Date: Sun, 18 Aug 2024 01:51:51 +0800 Subject: [PATCH 098/126] docs/zh_CN: Add dev-tools/kcsan Chinese translation Translate dev-tools/kcsan commit 31f605a308e6 ("kcsan, compiler_types: Introduce __data_racy type qualifier") into Chinese and add it in dev-tools/zh_CN/index.rst Signed-off-by: Haoyang Liu Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240817175151.164923-1-tttturtleruss@hust.edu.cn --- .../translations/zh_CN/dev-tools/index.rst | 2 +- .../translations/zh_CN/dev-tools/kcsan.rst | 320 ++++++++++++++++++ 2 files changed, 321 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/dev-tools/kcsan.rst diff --git a/Documentation/translations/zh_CN/dev-tools/index.rst b/Documentation/translations/zh_CN/dev-tools/index.rst index c540e4a7d5db..6a8c637c0be1 100644 --- a/Documentation/translations/zh_CN/dev-tools/index.rst +++ b/Documentation/translations/zh_CN/dev-tools/index.rst @@ -21,6 +21,7 @@ Documentation/translations/zh_CN/dev-tools/testing-overview.rst testing-overview sparse kcov + kcsan gcov kasan ubsan @@ -32,7 +33,6 @@ Todolist: - checkpatch - coccinelle - kmsan - - kcsan - kfence - kgdb - kselftest diff --git a/Documentation/translations/zh_CN/dev-tools/kcsan.rst b/Documentation/translations/zh_CN/dev-tools/kcsan.rst new file mode 100644 index 000000000000..8c495c17f109 --- /dev/null +++ b/Documentation/translations/zh_CN/dev-tools/kcsan.rst @@ -0,0 +1,320 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/dev-tools/kcsan.rst +:Translator: 刘浩阳 Haoyang Liu + +内核并发消毒剂(KCSAN) +===================== + +内核并发消毒剂(KCSAN)是一个动态竞争检测器,依赖编译时插桩,并且使用基于观察 +点的采样方法来检测竞争。KCSAN 的主要目的是检测 `数据竞争`_。 + +使用 +---- + +KCSAN 受 GCC 和 Clang 支持。使用 GCC 需要版本 11 或更高,使用 Clang 也需要 +版本 11 或更高。 + +为了启用 KCSAN,用如下参数配置内核:: + + CONFIG_KCSAN = y + +KCSAN 提供了几个其他的配置选项来自定义行为(见 ``lib/Kconfig.kcsan`` 中的各自的 +帮助文档以获取更多信息)。 + +错误报告 +~~~~~~~~ + +一个典型数据竞争的报告如下所示:: + + ================================================================== + BUG: KCSAN: data-race in test_kernel_read / test_kernel_write + + write to 0xffffffffc009a628 of 8 bytes by task 487 on cpu 0: + test_kernel_write+0x1d/0x30 + access_thread+0x89/0xd0 + kthread+0x23e/0x260 + ret_from_fork+0x22/0x30 + + read to 0xffffffffc009a628 of 8 bytes by task 488 on cpu 6: + test_kernel_read+0x10/0x20 + access_thread+0x89/0xd0 + kthread+0x23e/0x260 + ret_from_fork+0x22/0x30 + + value changed: 0x00000000000009a6 -> 0x00000000000009b2 + + Reported by Kernel Concurrency Sanitizer on: + CPU: 6 PID: 488 Comm: access_thread Not tainted 5.12.0-rc2+ #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + ================================================================== + +报告的头部提供了一个关于竞争中涉及到的函数的简短总结。随后是竞争中的两个线程的 +访问类型和堆栈信息。如果 KCSAN 发现了一个值的变化,那么那个值的旧值和新值会在 +“value changed”这一行单独显示。 + +另一个不太常见的数据竞争类型的报告如下所示:: + + ================================================================== + BUG: KCSAN: data-race in test_kernel_rmw_array+0x71/0xd0 + + race at unknown origin, with read to 0xffffffffc009bdb0 of 8 bytes by task 515 on cpu 2: + test_kernel_rmw_array+0x71/0xd0 + access_thread+0x89/0xd0 + kthread+0x23e/0x260 + ret_from_fork+0x22/0x30 + + value changed: 0x0000000000002328 -> 0x0000000000002329 + + Reported by Kernel Concurrency Sanitizer on: + CPU: 2 PID: 515 Comm: access_thread Not tainted 5.12.0-rc2+ #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + ================================================================== + +这个报告是当另一个竞争线程不可能被发现,但是可以从观测的内存地址的值改变而推断 +出来的时候生成的。这类报告总是会带有“value changed”行。这类报告的出现通常是因 +为在竞争线程中缺少插桩,也可能是因为其他原因,比如 DMA 访问。这类报告只会在 +设置了内核参数 ``CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN=y`` 时才会出现,而这 +个参数是默认启用的。 + +选择性分析 +~~~~~~~~~~ + +对于一些特定的访问,函数,编译单元或者整个子系统,可能需要禁用数据竞争检测。 +对于静态黑名单,有如下可用的参数: + +* KCSAN 支持使用 ``data_race(expr)`` 注解,这个注解告诉 KCSAN 任何由访问 + ``expr`` 所引起的数据竞争都应该被忽略,其产生的行为后果被认为是安全的。请查阅 + `在 LKMM 中 "标记共享内存访问"`_ 获得更多信息。 + +* 与 ``data_race(...)`` 相似,可以使用类型限定符 ``__data_racy`` 来标记一个变量 + ,所有访问该变量而导致的数据竞争都是故意为之并且应该被 KCSAN 忽略:: + + struct foo { + ... + int __data_racy stats_counter; + ... + }; + +* 使用函数属性 ``__no_kcsan`` 可以对整个函数禁用数据竞争检测:: + + __no_kcsan + void foo(void) { + ... + + 为了动态限制该为哪些函数生成报告,查阅 `Debug 文件系统接口`_ 黑名单/白名单特性。 + +* 为特定的编译单元禁用数据竞争检测,将下列参数加入到 ``Makefile`` 中:: + + KCSAN_SANITIZE_file.o := n + +* 为 ``Makefile`` 中的所有编译单元禁用数据竞争检测,将下列参数添加到相应的 + ``Makefile`` 中:: + + KCSAN_SANITIZE := n + +.. _在 LKMM 中 "标记共享内存访问": https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/access-marking.txt + +此外,KCSAN 可以根据偏好设置显示或隐藏整个类别的数据竞争。可以使用如下 +Kconfig 参数进行更改: + +* ``CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY``: 如果启用了该参数并且通过观测点 + (watchpoint) 观测到一个有冲突的写操作,但是对应的内存地址中存储的值没有改变, + 则不会报告这起数据竞争。 + +* ``CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC``: 假设默认情况下,不超过字大小的简 + 单对齐写入操作是原子的。假设这些写入操作不会受到不安全的编译器优化影响,从而导 + 致数据竞争。该选项使 KCSAN 不报告仅由不超过字大小的简单对齐写入操作引起 + 的冲突所导致的数据竞争。 + +* ``CONFIG_KCSAN_PERMISSIVE``: 启用额外的宽松规则来忽略某些常见类型的数据竞争。 + 与上面的规则不同,这条规则更加复杂,涉及到值改变模式,访问类型和地址。这个 + 选项依赖编译选项 ``CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=y``。请查看 + ``kernel/kcsan/permissive.h`` 获取更多细节。对于只侧重于特定子系统而不是整个 + 内核报告的测试者和维护者,建议禁用该选项。 + +要使用尽可能严格的规则,选择 ``CONFIG_KCSAN_STRICT=y``,这将配置 KCSAN 尽可 +能紧密地遵循 Linux 内核内存一致性模型(LKMM)。 + +Debug 文件系统接口 +~~~~~~~~~~~~~~~~~~ + +文件 ``/sys/kernel/debug/kcsan`` 提供了如下接口: + +* 读 ``/sys/kernel/debug/kcsan`` 返回不同的运行时统计数据。 + +* 将 ``on`` 或 ``off`` 写入 ``/sys/kernel/debug/kcsan`` 允许打开或关闭 KCSAN。 + +* 将 ``!some_func_name`` 写入 ``/sys/kernel/debug/kcsan`` 会将 + ``some_func_name`` 添加到报告过滤列表中,该列表(默认)会将数据竞争报告中的顶 + 层堆栈帧是列表中函数的情况列入黑名单。 + +* 将 ``blacklist`` 或 ``whitelist`` 写入 ``/sys/kernel/debug/kcsan`` 会改变报告 + 过滤行为。例如,黑名单的特性可以用来过滤掉经常发生的数据竞争。白名单特性可以帮 + 助复现和修复测试。 + +性能调优 +~~~~~~~~ + +影响 KCSAN 整体的性能和 bug 检测能力的核心参数是作为内核命令行参数公开的,其默认 +值也可以通过相应的 Kconfig 选项更改。 + +* ``kcsan.skip_watch`` (``CONFIG_KCSAN_SKIP_WATCH``): 在另一个观测点设置之前每 + 个 CPU 要跳过的内存操作次数。更加频繁的设置观测点将增加观察到竞争情况的可能性 + 。这个参数对系统整体的性能和竞争检测能力影响最显著。 + +* ``kcsan.udelay_task`` (``CONFIG_KCSAN_UDELAY_TASK``): 对于任务,观测点设置之 + 后暂停执行的微秒延迟。值越大,检测到竞争情况的可能性越高。 + +* ``kcsan.udelay_interrupt`` (``CONFIG_KCSAN_UDELAY_INTERRUPT``): 对于中断, + 观测点设置之后暂停执行的微秒延迟。中断对于延迟的要求更加严格,其延迟通常应该小 + 于为任务选择的延迟。 + +它们可以通过 ``/sys/module/kcsan/parameters/`` 在运行时进行调整。 + +数据竞争 +-------- + +在一次执行中,如果两个内存访问存在 *冲突*,在不同的线程中并发执行,并且至少 +有一个访问是 *简单访问*,则它们就形成了 *数据竞争*。如果它们访问了同一个内存地址并且 +至少有一个是写操作,则称它们存在 *冲突*。有关更详细的讨论和定义,见 +`LKMM 中的 "简单访问和数据竞争"`_。 + +.. _LKMM 中的 "简单访问和数据竞争": https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/memory-model/Documentation/explanation.txt#n1922 + +与 Linux 内核内存一致性模型(LKMM)的关系 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LKMM 定义了各种内存操作的传播和排序规则,让开发者可以推理并发代码。最终这允许确 +定并发代码可能的执行情况并判断这些代码是否存在数据竞争。 + +KCSAN 可以识别 *被标记的原子操作* ( ``READ_ONCE``, ``WRITE_ONCE`` , ``atomic_*`` +等),以及内存屏障所隐含的一部分顺序保证。启用 ``CONFIG_KCSAN_WEAK_MEMORY=y`` +配置,KCSAN 会对加载或存储缓冲区进行建模,并可以检测遗漏的 +``smp_mb()``, ``smp_wmb()``, ``smp_rmb()``, ``smp_store_release()``,以及所有的 +具有等效隐含内存屏障的 ``atomic_*`` 操作。 + +请注意,KCSAN 不会报告所有由于缺失内存顺序而导致的数据竞争,特别是在需要内存屏障 +来禁止后续内存操作在屏障之前重新排序的情况下。因此,开发人员应该仔细考虑那些未 +被检查的内存顺序要求。 + +数据竞争以外的竞争检测 +--------------------------- + +对于有着复杂并发设计的代码,竞争状况不总是表现为数据竞争。如果并发操作引起了意 +料之外的系统行为,则认为发生了竞争状况。另一方面,数据竞争是在 C 语言层面定义 +的。内核定义了一些宏定义用来检测非数据竞争的漏洞并发代码的属性。 + +.. note:: + 为了不引入新的文档编译警告,这里不展示宏定义的具体内容,如果想查看具体 + 宏定义可以结合原文(Documentation/dev-tools/kcsan.rst)阅读。 + +实现细节 +-------- + +KCSAN 需要观测两个并发访问。特别重要的是,我们想要(a)增加观测到竞争的机会(尤 +其是很少发生的竞争),以及(b)能够实际观测到这些竞争。我们可以通过(a)注入 +不同的延迟,以及(b)使用地址观测点(或断点)来实现。 + +如果我们在设置了地址观察点的情况下故意延迟一个内存访问,然后观察到观察点被触发 +,那么两个对同一地址的访问就发生了竞争。使用硬件观察点,这是 `DataCollider +`_ 中采用 +的方法。与 DataCollider 不同,KCSAN 不使用硬件观察点,而是依赖于编译器插桩和“软 +观测点”。 + +在 KCSAN 中,观察点是通过一种高效的编码实现的,该编码将访问类型、大小和地址存储 +在一个长整型变量中;使用“软观察点”的好处是具有可移植性和更大的灵活性。然后, +KCSAN依赖于编译器对普通访问的插桩。对于每个插桩的普通访问: + +1. 检测是否存在一个符合的观测点,如果存在,并且至少有一个操作是写操作,则我们发 + 现了一个竞争访问。 + +2. 如果不存在匹配的观察点,则定期的设置一个观测点并随机延迟一小段时间。 + +3. 在延迟前检查数据值,并在延迟后重新检查数据值;如果值不匹配,我们推测存在一个 + 未知来源的竞争状况。 + +为了检测普通访问和标记访问之间的数据竞争,KCSAN 也对标记访问进行标记,但仅用于 +检查是否存在观察点;即 KCSAN 不会在标记访问上设置观察点。通过不在标记操作上设 +置观察点,如果对一个变量的所有并发访问都被正确标记,KCSAN 将永远不会触发观察点 +,因此也不会报告这些访问。 + +弱内存建模 +~~~~~~~~~~ + +KCSAN 通过建模访问重新排序(使用 ``CONFIG_KCSAN_WEAK_MEMORY=y``)来检测由于缺少 +内存屏障而导致的数据竞争。每个设置了观察点的普通内存访问也会被选择在其函数范围 +内进行模拟重新排序(最多一个正在进行的访问)。 + +一旦某个访问被选择用于重新排序,它将在函数范围内与每个其他访问进行检查。如果遇 +到适当的内存屏障,该访问将不再被考虑进行模拟重新排序。 + +当内存操作的结果应该由屏障排序时,KCSAN 可以检测到仅由于缺失屏障而导致的冲突的 +数据竞争。考虑下面的例子:: + + int x, flag; + void T1(void) + { + x = 1; // data race! + WRITE_ONCE(flag, 1); // correct: smp_store_release(&flag, 1) + } + void T2(void) + { + while (!READ_ONCE(flag)); // correct: smp_load_acquire(&flag) + ... = x; // data race! + } + +当启用了弱内存建模,KCSAN 将考虑对 ``T1`` 中的 ``x`` 进行模拟重新排序。在写入 +``flag`` 之后,x再次被检查是否有并发访问:因为 ``T2`` 可以在写入 +``flag`` 之后继续进行,因此检测到数据竞争。如果遇到了正确的屏障, ``x`` 在正确 +释放 ``flag`` 后将不会被考虑重新排序,因此不会检测到数据竞争。 + +在复杂性上的权衡以及实际的限制意味着只能检测到一部分由于缺失内存屏障而导致的数 +据竞争。由于当前可用的编译器支持,KCSAN 的实现仅限于建模“缓冲”(延迟访问)的 +效果,因为运行时不能“预取”访问。同时要注意,观测点只设置在普通访问上,这是唯 +一一个 KCSAN 会模拟重新排序的访问类型。这意味着标记访问的重新排序不会被建模。 + +上述情况的一个后果是获取 (acquire) 操作不需要屏障插桩(不需要预取)。此外,引 +入地址或控制依赖的标记访问不需要特殊处理(标记访问不能重新排序,后续依赖的访问 +不能被预取)。 + +关键属性 +~~~~~~~~ + +1. **内存开销**:整体的内存开销只有几 MiB,取决于配置。当前的实现是使用一个小长 + 整型数组来编码观测点信息,几乎可以忽略不计。 + +2. **性能开销**:KCSAN 的运行时旨在性能开销最小化,使用一个高效的观测点编码,在 + 快速路径中不需要获取任何锁。在拥有 8 个 CPU 的系统上的内核启动来说: + + - 使用默认 KCSAN 配置时,性能下降 5 倍; + - 仅因运行时快速路径开销导致性能下降 2.8 倍(设置非常大的 + ``KCSAN_SKIP_WATCH`` 并取消设置 ``KCSAN_SKIP_WATCH_RANDOMIZE``)。 + +3. **注解开销**:KCSAN 运行时之外需要的注释很少。因此,随着内核的发展维护的开 + 销也很小。 + +4. **检测设备的竞争写入**:由于设置观测点时会检查数据值,设备的竞争写入也可以 + 被检测到。 + +5. **内存排序**:KCSAN 只了解一部分 LKMM 排序规则;这可能会导致漏报数据竞争( + 假阴性)。 + +6. **分析准确率**: 对于观察到的执行,由于使用采样策略,分析是 *不健全* 的 + (可能有假阴性),但期望得到完整的分析(没有假阳性)。 + +考虑的替代方案 +-------------- + +一个内核数据竞争检测的替代方法是 `Kernel Thread Sanitizer (KTSAN) +`_。KTSAN 是一 +个基于先行发生关系(happens-before)的数据竞争检测器,它显式建立内存操作之间的先 +后发生顺序,这可以用来确定 `数据竞争`_ 中定义的数据竞争。 + +为了建立正确的先行发生关系,KTSAN 必须了解 LKMM 的所有排序规则和同步原语。不幸 +的是,任何遗漏都会导致大量的假阳性,这在包含众多自定义同步机制的内核上下文中特 +别有害。为了跟踪前因后果关系,KTSAN 的实现需要为每个内存位置提供元数据(影子内 +存),这意味着每页内存对应 4 页影子内存,在大型系统上可能会带来数十 GiB 的开销 +。 From 033964f13ef2aec1c0ae090755935055c11aaef7 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 15 Aug 2024 14:34:49 +0300 Subject: [PATCH 099/126] get_maintainer: add --bug option to print bug reporting info For example Documentation/adming-guide/bug-hunting.rst suggest using get_maintainer.pl to get a list of maintainers and mailing lists to report bugs to, while a number of subsystems and drivers explicitly use the "B:" MAINTAINERS entry to direct bug reports at issue trackers instead of mailing lists and people. Add the --bug option to get_maintainer.pl to print the bug reporting URIs, if any. Cc: Joe Perches Cc: Jonathan Corbet Signed-off-by: Jani Nikula Acked-by: Joe Perches Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240815113450.3397499-1-jani.nikula@intel.com --- scripts/get_maintainer.pl | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index ee1aed7e090c..5ac02e198737 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -54,6 +54,7 @@ my $output_section_maxlen = 50; my $scm = 0; my $tree = 1; my $web = 0; +my $bug = 0; my $subsystem = 0; my $status = 0; my $letters = ""; @@ -271,6 +272,7 @@ if (!GetOptions( 'scm!' => \$scm, 'tree!' => \$tree, 'web!' => \$web, + 'bug!' => \$bug, 'letters=s' => \$letters, 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, @@ -320,13 +322,14 @@ if ($sections || $letters ne "") { $status = 0; $subsystem = 0; $web = 0; + $bug = 0; $keywords = 0; $keywords_in_file = 0; $interactive = 0; } else { - my $selections = $email + $scm + $status + $subsystem + $web; + my $selections = $email + $scm + $status + $subsystem + $web + $bug; if ($selections == 0) { - die "$P: Missing required option: email, scm, status, subsystem or web\n"; + die "$P: Missing required option: email, scm, status, subsystem, web or bug\n"; } } @@ -631,6 +634,7 @@ my %hash_list_to; my @list_to = (); my @scm = (); my @web = (); +my @bug = (); my @subsystem = (); my @status = (); my %deduplicate_name_hash = (); @@ -662,6 +666,11 @@ if ($web) { output(@web); } +if ($bug) { + @bug = uniq(@bug); + output(@bug); +} + exit($exit); sub self_test { @@ -847,6 +856,7 @@ sub get_maintainers { @list_to = (); @scm = (); @web = (); + @bug = (); @subsystem = (); @status = (); %deduplicate_name_hash = (); @@ -1069,6 +1079,7 @@ MAINTAINER field selection options: --status => print status if any --subsystem => print subsystem name if any --web => print website(s) if any + --bug => print bug reporting info if any Output type options: --separator [, ] => separator for multiple entries on 1 line @@ -1382,6 +1393,8 @@ sub add_categories { push(@scm, $pvalue . $suffix); } elsif ($ptype eq "W") { push(@web, $pvalue . $suffix); + } elsif ($ptype eq "B") { + push(@bug, $pvalue . $suffix); } elsif ($ptype eq "S") { push(@status, $pvalue . $suffix); } From cd0403adeaf78b4506403cf75317cc4d97ba8b5e Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 15 Aug 2024 14:34:50 +0300 Subject: [PATCH 100/126] Documentation: admin-guide: direct people to bug trackers, if specified Update bug reporting info in bug-hunting.rst to direct people to driver/subsystem bug trackers, if explicitly specified with the MAINTAINERS "B:" entry. Use the new get_maintainer.pl --bug option to print the info. Cc: Joe Perches Cc: Jonathan Corbet Signed-off-by: Jani Nikula Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240815113450.3397499-2-jani.nikula@intel.com --- Documentation/admin-guide/bug-hunting.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index 95299b08c405..1d0f8ceb3075 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -244,14 +244,14 @@ Reporting the bug Once you find where the bug happened, by inspecting its location, you could either try to fix it yourself or report it upstream. -In order to report it upstream, you should identify the mailing list -used for the development of the affected code. This can be done by using -the ``get_maintainer.pl`` script. +In order to report it upstream, you should identify the bug tracker, if any, or +mailing list used for the development of the affected code. This can be done by +using the ``get_maintainer.pl`` script. For example, if you find a bug at the gspca's sonixj.c file, you can get its maintainers with:: - $ ./scripts/get_maintainer.pl -f drivers/media/usb/gspca/sonixj.c + $ ./scripts/get_maintainer.pl --bug -f drivers/media/usb/gspca/sonixj.c Hans Verkuil (odd fixer:GSPCA USB WEBCAM DRIVER,commit_signer:1/1=100%) Mauro Carvalho Chehab (maintainer:MEDIA INPUT INFRASTRUCTURE (V4L/DVB),commit_signer:1/1=100%) Tejun Heo (commit_signer:1/1=100%) @@ -267,11 +267,12 @@ Please notice that it will point to: - The driver maintainer (Hans Verkuil); - The subsystem maintainer (Mauro Carvalho Chehab); - The driver and/or subsystem mailing list (linux-media@vger.kernel.org); -- the Linux Kernel mailing list (linux-kernel@vger.kernel.org). +- The Linux Kernel mailing list (linux-kernel@vger.kernel.org); +- The bug reporting URIs for the driver/subsystem (none in the above example). -Usually, the fastest way to have your bug fixed is to report it to mailing -list used for the development of the code (linux-media ML) copying the -driver maintainer (Hans). +If the listing contains bug reporting URIs at the end, please prefer them over +email. Otherwise, please report bugs to the mailing list used for the +development of the code (linux-media ML) copying the driver maintainer (Hans). If you are totally stumped as to whom to send the report, and ``get_maintainer.pl`` didn't provide you anything useful, send it to From 3232216228411d2a594d297b4fdaa67f254a0be2 Mon Sep 17 00:00:00 2001 From: Thorsten Scherer Date: Mon, 5 Aug 2024 14:03:47 +0200 Subject: [PATCH 101/126] doc: iio: Fix sysfs paths Add missing 'devices' folder in the /sys/bus/iio path. Signed-off-by: Thorsten Scherer Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240805120357.21135-1-t.scherer@eckelmann.de --- Documentation/driver-api/iio/buffers.rst | 8 ++++---- Documentation/driver-api/iio/core.rst | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/driver-api/iio/buffers.rst b/Documentation/driver-api/iio/buffers.rst index e83026aebe97..63f364e862d1 100644 --- a/Documentation/driver-api/iio/buffers.rst +++ b/Documentation/driver-api/iio/buffers.rst @@ -15,8 +15,8 @@ trigger source. Multiple data channels can be read at once from IIO buffer sysfs interface ========================== An IIO buffer has an associated attributes directory under -:file:`/sys/bus/iio/iio:device{X}/buffer/*`. Here are some of the existing -attributes: +:file:`/sys/bus/iio/devices/iio:device{X}/buffer/*`. Here are some of the +existing attributes: * :file:`length`, the total number of data samples (capacity) that can be stored by the buffer. @@ -28,8 +28,8 @@ IIO buffer setup The meta information associated with a channel reading placed in a buffer is called a scan element. The important bits configuring scan elements are exposed to userspace applications via the -:file:`/sys/bus/iio/iio:device{X}/scan_elements/` directory. This directory contains -attributes of the following form: +:file:`/sys/bus/iio/devices/iio:device{X}/scan_elements/` directory. This +directory contains attributes of the following form: * :file:`enable`, used for enabling a channel. If and only if its attribute is non *zero*, then a triggered capture will contain data samples for this diff --git a/Documentation/driver-api/iio/core.rst b/Documentation/driver-api/iio/core.rst index 715cf29482a1..dfe438dc91a7 100644 --- a/Documentation/driver-api/iio/core.rst +++ b/Documentation/driver-api/iio/core.rst @@ -24,7 +24,7 @@ then we will show how a device driver makes use of an IIO device. There are two ways for a user space application to interact with an IIO driver. -1. :file:`/sys/bus/iio/iio:device{X}/`, this represents a hardware sensor +1. :file:`/sys/bus/iio/devices/iio:device{X}/`, this represents a hardware sensor and groups together the data channels of the same chip. 2. :file:`/dev/iio:device{X}`, character device node interface used for buffered data transfer and for events information retrieval. @@ -51,8 +51,8 @@ IIO device sysfs interface Attributes are sysfs files used to expose chip info and also allowing applications to set various configuration parameters. For device with -index X, attributes can be found under /sys/bus/iio/iio:deviceX/ directory. -Common attributes are: +index X, attributes can be found under /sys/bus/iio/devices/iio:deviceX/ +directory. Common attributes are: * :file:`name`, description of the physical chip. * :file:`dev`, shows the major:minor pair associated with @@ -140,16 +140,16 @@ Here is how we can make use of the channel's modifiers:: This channel's definition will generate two separate sysfs files for raw data retrieval: -* :file:`/sys/bus/iio/iio:device{X}/in_intensity_ir_raw` -* :file:`/sys/bus/iio/iio:device{X}/in_intensity_both_raw` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_intensity_ir_raw` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_intensity_both_raw` one file for processed data: -* :file:`/sys/bus/iio/iio:device{X}/in_illuminance_input` +* :file:`/sys/bus/iio/devices/iio:device{X}/in_illuminance_input` and one shared sysfs file for sampling frequency: -* :file:`/sys/bus/iio/iio:device{X}/sampling_frequency`. +* :file:`/sys/bus/iio/devices/iio:device{X}/sampling_frequency`. Here is how we can make use of the channel's indexing:: From ee27a49c0fe97b5e696b004b36e1ed756d3f8dc5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 25 Aug 2024 18:09:47 -0700 Subject: [PATCH 102/126] Docs/translations/ko_KR: link howto.rst with other language versions The menu for documents of other available languages is created for documents in same file hierarchy under the translations/ directory. Because howto.rst of Korean translation is at the root of translations/ directory while that for English is under howto/ directory, the Korean translation is not linked with other available language versions via the menu. Move the document under the same hierarchy to make it be linked with other langauge versions. Signed-off-by: SeongJae Park Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240826010949.78305-1-sj@kernel.org --- Documentation/translations/ko_KR/index.rst | 2 +- Documentation/translations/ko_KR/{ => process}/howto.rst | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename Documentation/translations/ko_KR/{ => process}/howto.rst (100%) diff --git a/Documentation/translations/ko_KR/index.rst b/Documentation/translations/ko_KR/index.rst index 4add6b2fe1f2..2d51f1481310 100644 --- a/Documentation/translations/ko_KR/index.rst +++ b/Documentation/translations/ko_KR/index.rst @@ -11,7 +11,7 @@ .. toctree:: :maxdepth: 1 - howto + process/howto 리눅스 커널 메모리 배리어 diff --git a/Documentation/translations/ko_KR/howto.rst b/Documentation/translations/ko_KR/process/howto.rst similarity index 100% rename from Documentation/translations/ko_KR/howto.rst rename to Documentation/translations/ko_KR/process/howto.rst From 227f6cf96948efe965c2f2c9d940eb6a226cd39c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 25 Aug 2024 18:09:48 -0700 Subject: [PATCH 103/126] Docs/translations/ko_KR: link memory-barriers wrapper with other language versions The menu for documents of other available languages is created for documents in same file hierarchy under the translations/ directory. Because memory-barriers.txt of Korean translation is at the root index of translations/ directory while that for English is on core-api/wrappers/memory-barriers.rst, the Korean translation is not linked with other available language versions via the menu. Move the document under the same directory hierarchy to make it linked with other language versions. Signed-off-by: SeongJae Park Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240826010949.78305-2-sj@kernel.org --- .../core-api/wrappers/memory-barriers.rst | 18 ++++++++++++++++++ Documentation/translations/ko_KR/index.rst | 14 ++------------ 2 files changed, 20 insertions(+), 12 deletions(-) create mode 100644 Documentation/translations/ko_KR/core-api/wrappers/memory-barriers.rst diff --git a/Documentation/translations/ko_KR/core-api/wrappers/memory-barriers.rst b/Documentation/translations/ko_KR/core-api/wrappers/memory-barriers.rst new file mode 100644 index 000000000000..526ae534dd86 --- /dev/null +++ b/Documentation/translations/ko_KR/core-api/wrappers/memory-barriers.rst @@ -0,0 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + This is a simple wrapper to bring memory-barriers.txt into the RST world + until such a time as that file can be converted directly. + +========================= +리눅스 커널 메모리 배리어 +========================= + +.. raw:: latex + + \footnotesize + +.. include:: ../../memory-barriers.txt + :literal: + +.. raw:: latex + + \normalsize diff --git a/Documentation/translations/ko_KR/index.rst b/Documentation/translations/ko_KR/index.rst index 2d51f1481310..a20772f9d61c 100644 --- a/Documentation/translations/ko_KR/index.rst +++ b/Documentation/translations/ko_KR/index.rst @@ -12,18 +12,8 @@ :maxdepth: 1 process/howto - - -리눅스 커널 메모리 배리어 -------------------------- + core-api/wrappers/memory-barriers.rst .. raw:: latex - \footnotesize - -.. include:: ./memory-barriers.txt - :literal: - -.. raw:: latex - - }\kerneldocEndKR + }\kerneldocEndKR From f92a24ae7c953263600fc7ea3f0594676ea82138 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 25 Jul 2024 19:00:41 +0100 Subject: [PATCH 104/126] Documentation/fs/9p: Expand goo.gl link The goo.gl URL shortener is deprecated and is due to stop expanding existing links in 2025. The old goo.gl link in the 9p docs doesn't work anyway, replace it by a kernel.org link suggested by Randy instead. Signed-off-by: Dr. David Alan Gilbert Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240725180041.80862-1-linux@treblig.org --- Documentation/filesystems/9p.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/9p.rst b/Documentation/filesystems/9p.rst index 1e0e0bb6fdf9..d270a0aa8d55 100644 --- a/Documentation/filesystems/9p.rst +++ b/Documentation/filesystems/9p.rst @@ -31,7 +31,7 @@ Other applications are described in the following papers: * PROSE I/O: Using 9p to enable Application Partitions http://plan9.escet.urjc.es/iwp9/cready/PROSE_iwp9_2006.pdf * VirtFS: A Virtualization Aware File System pass-through - http://goo.gl/3WPDg + https://kernel.org/doc/ols/2010/ols2010-pages-109-120.pdf Usage ===== From 4460e8538ef17c86eae46ccbe096eee8c740a7d0 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 10 Jun 2024 10:28:10 +0500 Subject: [PATCH 105/126] MAINTAINERS: Add selftests/x86 entry There are no maintainers specified for tools/testing/selftests/x86. Shuah has mentioned [1] that the patches should go through x86 tree or in special cases directly to Shuah's tree after getting ack-ed from x86 maintainers. Different people have been confused when sending patches as correct maintainers aren't found by get_maintainer.pl script. Fix this by adding entry to MAINTAINERS file. [1] https://lore.kernel.org/all/90dc0dfc-4c67-4ea1-b705-0585d6e2ec47@linuxfoundation.org Signed-off-by: Muhammad Usama Anjum Signed-off-by: Borislav Petkov (AMD) Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20240610052810.1488793-1-usama.anjum@collabora.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6daab3f05958..7715a806c255 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24772,6 +24772,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core F: Documentation/arch/x86/ F: Documentation/devicetree/bindings/x86/ F: arch/x86/ +F: tools/testing/selftests/x86 X86 CPUID DATABASE M: Borislav Petkov From a678164aadbf68d80f7ab79b8bd5cfe3711e42fa Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 26 Aug 2024 10:21:47 +0100 Subject: [PATCH 106/126] x86/EISA: Dereference memory directly instead of using readl() Sparse expect an __iomem pointer, but after converting the EISA probe to memremap() the pointer is a regular memory pointer. Access it directly instead. [ tglx: Converted it to fix the already applied version ] Fixes: 80a4da05642c ("x86/EISA: Use memremap() to probe for the EISA BIOS signature") Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/alpine.DEB.2.21.2408261015270.30766@angie.orcam.me.uk --- arch/x86/kernel/eisa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index 5a4f99a098bf..9535a6507db7 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -11,13 +11,13 @@ static __init int eisa_bus_probe(void) { - void *p; + u32 *p; if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return 0; p = memremap(0x0FFFD9, 4, MEMREMAP_WB); - if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) + if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; memunmap(p); return 0; From 72ffee678f6f7d9ebf5350a6e38f31fd306c9b8a Mon Sep 17 00:00:00 2001 From: Haoyang Liu Date: Fri, 26 Jul 2024 01:46:31 +0800 Subject: [PATCH 107/126] docs: update dev-tools/kcsan.rst url about KTSAN The KTSAN doc has moved to https://github.com/google/kernel-sanitizers/blob/master/KTSAN.md. Update the url in kcsan.rst accordingly. Signed-off-by: Haoyang Liu Reviewed-by: Dongliang Mu Acked-by: Marco Elver Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240725174632.23803-1-tttturtleruss@hust.edu.cn --- Documentation/dev-tools/kcsan.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index 02143f060b22..d81c42d1063e 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -361,7 +361,8 @@ Alternatives Considered ----------------------- An alternative data race detection approach for the kernel can be found in the -`Kernel Thread Sanitizer (KTSAN) `_. +`Kernel Thread Sanitizer (KTSAN) +`_. KTSAN is a happens-before data race detector, which explicitly establishes the happens-before order between memory operations, which can then be used to determine data races as defined in `Data Races`_. From eb5ed2fae19745fcb7dd0dcfbfbcd8b2847bc5c1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 5 Sep 2024 13:33:33 +0100 Subject: [PATCH 108/126] docs: submitting-patches: Advertise b4 b4 is now widely used and is quite helpful for a lot of the things that submitting-patches covers, let's advertise it to submitters to try to make their lives easier and reduce the number of procedural issues maintainers see. Reviewed-by: Shuah Khan Signed-off-by: Mark Brown Acked-by: Konstantin Ryabitsev Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240905-documentation-b4-advert-v2-1-24d686ba4117@kernel.org --- Documentation/process/submitting-patches.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index f310f2f36666..1518bd57adab 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -842,6 +842,14 @@ Make sure that base commit is in an official maintainer/mainline tree and not in some internal, accessible only to you tree - otherwise it would be worthless. +Tooling +------- + +Many of the technical aspects of this process can be automated using +b4, documented at . This can +help with things like tracking dependencies, running checkpatch and +with formatting and sending mails. + References ---------- From 93292980f390b9245d8e3ce9b0b6c94ee45be217 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Thu, 5 Sep 2024 14:09:41 +0900 Subject: [PATCH 109/126] docs: kerneldoc-preamble.sty: Suppress extra spaces in CJK literal blocks In zh_CN part of translations.pdf, there are several ASCII-art diagrams whose vertical lines look sometimes jagged. This is due to the interference between default settings of xeCJK and fancyvrb (employed in sphinxVerbatim env), where extra space is inserted between a latin char and a non-latin char when they are next to each other (i.e., no explicit white space). This issue can be suppressed by invoking \CJKsetecglue{} at the beginning of every sphinxVerbatim enviornment. \AtBeginEnvironment, provided by the etoolbox package, is useful in this case. Signed-off-by: Akira Yokosawa Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240905050941.31439-1-akiyks@gmail.com --- Documentation/sphinx/kerneldoc-preamble.sty | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/sphinx/kerneldoc-preamble.sty b/Documentation/sphinx/kerneldoc-preamble.sty index d479cfa73658..5d68395539fe 100644 --- a/Documentation/sphinx/kerneldoc-preamble.sty +++ b/Documentation/sphinx/kerneldoc-preamble.sty @@ -199,6 +199,8 @@ % Inactivate CJK after tableofcontents \apptocmd{\sphinxtableofcontents}{\kerneldocCJKoff}{}{} \xeCJKsetup{CJKspace = true}% For inter-phrase space of Korean TOC + % Suppress extra white space at latin .. non-latin in literal blocks + \AtBeginEnvironment{sphinxVerbatim}{\CJKsetecglue{}} }{ % Don't enable CJK % Custom macros to on/off CJK and switch CJK fonts (Dummy) \newcommand{\kerneldocCJKon}{} From 34ea875cca2c2fa4ec8b70fc2b21ee6c7878740e Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Thu, 29 Aug 2024 00:50:36 +0800 Subject: [PATCH 110/126] docs: scheduler: completion: Update member of struct completion The member "wait" in struct completion isn't of type wait_queue_head_t anymore, as it is now "struct swait_queue_head", fix it to match with the current implementation. Signed-off-by: I Hsin Cheng Reviewed-by: Kuan-Wei Chiu Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240828165036.178011-1-richard120310@gmail.com --- Documentation/scheduler/completion.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/scheduler/completion.rst b/Documentation/scheduler/completion.rst index f19aca2062bd..adf0c0a56d02 100644 --- a/Documentation/scheduler/completion.rst +++ b/Documentation/scheduler/completion.rst @@ -51,7 +51,7 @@ which has only two fields:: struct completion { unsigned int done; - wait_queue_head_t wait; + struct swait_queue_head wait; }; This provides the ->wait waitqueue to place tasks on for waiting (if any), and From e04eb52bfaf436e5d04e3a774a60e753a5a2f49e Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 28 Aug 2024 11:48:58 -0300 Subject: [PATCH 111/126] Documentation: Document the kernel flag bdev_allow_write_mounted Commit ed5cc702d311 ("block: Add config option to not allow writing to mounted devices") added a Kconfig option along with a kernel command-line tuning to control writes to mounted block devices, as a means to deal with fuzzers like Syzkaller, that provokes kernel crashes by directly writing on block devices bypassing the filesystem (so the FS has no awareness and cannot cope with that). The patch just missed adding such kernel command-line option to the kernel documentation, so let's fix that. Cc: Bart Van Assche Cc: Darrick J. Wong Cc: Jens Axboe Reviewed-by: Jan Kara Signed-off-by: Guilherme G. Piccoli Reviewed-by: Darrick J. Wong Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240828145045.309835-1-gpiccoli@igalia.com --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 09126bb8cc9f..efc52ddc6864 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -517,6 +517,18 @@ Format: ,, See header of drivers/net/hamradio/baycom_ser_hdx.c. + bdev_allow_write_mounted= + Format: + Control the ability to open a mounted block device + for writing, i.e., allow / disallow writes that bypass + the FS. This was implemented as a means to prevent + fuzzers from crashing the kernel by overwriting the + metadata underneath a mounted FS without its awareness. + This also prevents destructive formatting of mounted + filesystems by naive storage tooling that don't use + O_EXCL. Default is Y and can be changed through the + Kconfig option CONFIG_BLK_DEV_WRITE_MOUNTED. + bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. From bc6cb62007fd6e321385d2df6fae3c38b53c0a82 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 28 Aug 2024 12:59:50 +0800 Subject: [PATCH 112/126] Loongarch: KVM: Add KVM hypercalls documentation for LoongArch Add documentation topic for using pv_virt when running as a guest on KVM hypervisor. Signed-off-by: Bibo Mao Signed-off-by: Xianglai Li Co-developed-by: Mingcong Bai Signed-off-by: Mingcong Bai Link: https://lore.kernel.org/all/5c338084b1bcccc1d57dce9ddb1e7081@aosc.io/ Signed-off-by: Dandan Zhang [jc: fixed htmldocs build error] Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/4769C036576F8816+20240828045950.3484113-1-zhangdandan@uniontech.com --- Documentation/virt/kvm/index.rst | 1 + .../virt/kvm/loongarch/hypercalls.rst | 89 +++++++++++++++++++ Documentation/virt/kvm/loongarch/index.rst | 10 +++ MAINTAINERS | 1 + 4 files changed, 101 insertions(+) create mode 100644 Documentation/virt/kvm/loongarch/hypercalls.rst create mode 100644 Documentation/virt/kvm/loongarch/index.rst diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index ad13ec55ddfe..9ca5a45c2140 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -14,6 +14,7 @@ KVM s390/index ppc-pv x86/index + loongarch/index locking vcpu-requests diff --git a/Documentation/virt/kvm/loongarch/hypercalls.rst b/Documentation/virt/kvm/loongarch/hypercalls.rst new file mode 100644 index 000000000000..2d6b94031f1b --- /dev/null +++ b/Documentation/virt/kvm/loongarch/hypercalls.rst @@ -0,0 +1,89 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +The LoongArch paravirtual interface +=================================== + +KVM hypercalls use the HVCL instruction with code 0x100 and the hypercall +number is put in a0. Up to five arguments may be placed in registers a1 - a5. +The return value is placed in v0 (an alias of a0). + +Source code for this interface can be found in arch/loongarch/kvm*. + +Querying for existence +====================== + +To determine if the host is running on KVM, we can utilize the cpucfg() +function at index CPUCFG_KVM_BASE (0x40000000). + +The CPUCFG_KVM_BASE range, spanning from 0x40000000 to 0x400000FF, The +CPUCFG_KVM_BASE range between 0x40000000 - 0x400000FF is marked as reserved. +Consequently, all current and future processors will not implement any +feature within this range. + +On a KVM-virtualized Linux system, a read operation on cpucfg() at index +CPUCFG_KVM_BASE (0x40000000) returns the magic string 'KVM\0'. + +Once you have determined that your host is running on a paravirtualization- +capable KVM, you may now use hypercalls as described below. + +KVM hypercall ABI +================= + +The KVM hypercall ABI is simple, with one scratch register a0 (v0) and at most +five generic registers (a1 - a5) used as input parameters. The FP (Floating- +point) and vector registers are not utilized as input registers and must +remain unmodified during a hypercall. + +Hypercall functions can be inlined as it only uses one scratch register. + +The parameters are as follows: + + ======== ================= ================ + Register IN OUT + ======== ================= ================ + a0 function number Return code + a1 1st parameter - + a2 2nd parameter - + a3 3rd parameter - + a4 4th parameter - + a5 5th parameter - + ======== ================= ================ + +The return codes may be one of the following: + + ==== ========================= + Code Meaning + ==== ========================= + 0 Success + -1 Hypercall not implemented + -2 Bad Hypercall parameter + ==== ========================= + +KVM Hypercalls Documentation +============================ + +The template for each hypercall is as follows: + +1. Hypercall name +2. Purpose + +1. KVM_HCALL_FUNC_IPI +------------------------ + +:Purpose: Send IPIs to multiple vCPUs. + +- a0: KVM_HCALL_FUNC_IPI +- a1: Lower part of the bitmap for destination physical CPUIDs +- a2: Higher part of the bitmap for destination physical CPUIDs +- a3: The lowest physical CPUID in the bitmap + +The hypercall lets a guest send multiple IPIs (Inter-Process Interrupts) with +at most 128 destinations per hypercall. The destinations are represented in a +bitmap contained in the first two input registers (a1 and a2). + +Bit 0 of a1 corresponds to the physical CPUID in the third input register (a3) +and bit 1 corresponds to the physical CPUID in a3+1, and so on. + +PV IPI on LoongArch includes both PV IPI multicast sending and PV IPI receiving, +and SWI is used for PV IPI inject since there is no VM-exits accessing SWI registers. diff --git a/Documentation/virt/kvm/loongarch/index.rst b/Documentation/virt/kvm/loongarch/index.rst new file mode 100644 index 000000000000..83387b4c5345 --- /dev/null +++ b/Documentation/virt/kvm/loongarch/index.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +KVM for LoongArch systems +========================= + +.. toctree:: + :maxdepth: 2 + + hypercalls.rst diff --git a/MAINTAINERS b/MAINTAINERS index 9d30466b14a7..a556c90a8b57 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12299,6 +12299,7 @@ L: kvm@vger.kernel.org L: loongarch@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git +F: Documentation/virt/kvm/loongarch/ F: arch/loongarch/include/asm/kvm* F: arch/loongarch/include/uapi/asm/kvm* F: arch/loongarch/kvm/ From 9b8a79f4c1d844d52273a31bc6511fa8ab5e8669 Mon Sep 17 00:00:00 2001 From: Sebastian Muxel Date: Tue, 27 Aug 2024 15:32:24 +0200 Subject: [PATCH 113/126] scripts: sphinx-pre-install: remove unnecessary double check for $cur_version $cur_version is currently being tested twice with the first test resulting in an unhelpful "$sphinx returned an error", not continuing to the more helpful "$sphinx didn't return its version". This patch removes the first test to return the more useful message. Fixes: a8b380c379ef ("scripts: sphinx-pre-install: only ask to activate valid venvs") Signed-off-by: Sebastian Muxel Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240827133224.160776-1-sebastian@muxel.dev --- scripts/sphinx-pre-install | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index c1121f098542..ad9945ccb0cf 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -300,8 +300,6 @@ sub check_sphinx() } $cur_version = get_sphinx_version($sphinx); - die ("$sphinx returned an error") if (!$cur_version); - die "$sphinx didn't return its version" if (!$cur_version); if ($cur_version lt $min_version) { From 4a93831daadd9652539db7af77434939e6e44c9e Mon Sep 17 00:00:00 2001 From: Aryabhatta Dey Date: Tue, 27 Aug 2024 18:48:52 +0530 Subject: [PATCH 114/126] Documentation/gpu: Fix typo in Documentation/gpu/komeda-kms.rst Change 'indenpendently' to 'independently'. Signed-off-by: Aryabhatta Dey Acked-by: Liviu Dudau Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/l5wzytcamcc43eadaquqbrfqilq6ajfnnseh37c77eceamtw35@hhtdipi4h22c --- Documentation/gpu/komeda-kms.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/gpu/komeda-kms.rst b/Documentation/gpu/komeda-kms.rst index 633a016563ae..eaea40eb725b 100644 --- a/Documentation/gpu/komeda-kms.rst +++ b/Documentation/gpu/komeda-kms.rst @@ -86,7 +86,7 @@ types of working mode: - Single display mode Two pipelines work together to drive only one display output. - On this mode, pipeline_B doesn't work indenpendently, but outputs its + On this mode, pipeline_B doesn't work independently, but outputs its composition result into pipeline_A, and its pixel timing also derived from pipeline_A.timing_ctrlr. The pipeline_B works just like a "slave" of pipeline_A(master) From 4538480b27a94522ff6432b2d728fafc74f61829 Mon Sep 17 00:00:00 2001 From: Amit Vadhavana Date: Sat, 17 Aug 2024 12:57:24 +0530 Subject: [PATCH 115/126] Documentation: Fix spelling mistakes Correct spelling mistakes in the documentation to improve readability. Signed-off-by: Amit Vadhavana Reviewed-by: Bjorn Helgaas Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240817072724.6861-1-av2082000@gmail.com --- Documentation/arch/arm/stm32/stm32-dma-mdma-chaining.rst | 4 ++-- Documentation/arch/arm64/cpu-hotplug.rst | 2 +- Documentation/arch/powerpc/ultravisor.rst | 2 +- Documentation/arch/riscv/vector.rst | 2 +- Documentation/arch/x86/mds.rst | 2 +- Documentation/arch/x86/x86_64/fsgs.rst | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/arch/arm/stm32/stm32-dma-mdma-chaining.rst b/Documentation/arch/arm/stm32/stm32-dma-mdma-chaining.rst index 2945e0e33104..301aa30890ae 100644 --- a/Documentation/arch/arm/stm32/stm32-dma-mdma-chaining.rst +++ b/Documentation/arch/arm/stm32/stm32-dma-mdma-chaining.rst @@ -359,7 +359,7 @@ Driver updates for STM32 DMA-MDMA chaining support in foo driver descriptor you want a callback to be called at the end of the transfer (dmaengine_prep_slave_sg()) or the period (dmaengine_prep_dma_cyclic()). Depending on the direction, set the callback on the descriptor that finishes - the overal transfer: + the overall transfer: * DMA_DEV_TO_MEM: set the callback on the "MDMA" descriptor * DMA_MEM_TO_DEV: set the callback on the "DMA" descriptor @@ -371,7 +371,7 @@ Driver updates for STM32 DMA-MDMA chaining support in foo driver As STM32 MDMA channel transfer is triggered by STM32 DMA, you must issue STM32 MDMA channel before STM32 DMA channel. - If any, your callback will be called to warn you about the end of the overal + If any, your callback will be called to warn you about the end of the overall transfer or the period completion. Don't forget to terminate both channels. STM32 DMA channel is configured in diff --git a/Documentation/arch/arm64/cpu-hotplug.rst b/Documentation/arch/arm64/cpu-hotplug.rst index 76ba8d932c72..8fb438bf7781 100644 --- a/Documentation/arch/arm64/cpu-hotplug.rst +++ b/Documentation/arch/arm64/cpu-hotplug.rst @@ -26,7 +26,7 @@ There are no systems that support the physical addition (or removal) of CPUs while the system is running, and ACPI is not able to sufficiently describe them. -e.g. New CPUs come with new caches, but the platform's cache toplogy is +e.g. New CPUs come with new caches, but the platform's cache topology is described in a static table, the PPTT. How caches are shared between CPUs is not discoverable, and must be described by firmware. diff --git a/Documentation/arch/powerpc/ultravisor.rst b/Documentation/arch/powerpc/ultravisor.rst index ba6b1bf1cc44..6d0407b2f5a1 100644 --- a/Documentation/arch/powerpc/ultravisor.rst +++ b/Documentation/arch/powerpc/ultravisor.rst @@ -134,7 +134,7 @@ Hardware * PTCR and partition table entries (partition table is in secure memory). An attempt to write to PTCR will cause a Hypervisor - Emulation Assitance interrupt. + Emulation Assistance interrupt. * LDBAR (LD Base Address Register) and IMC (In-Memory Collection) non-architected registers. An attempt to write to them will cause a diff --git a/Documentation/arch/riscv/vector.rst b/Documentation/arch/riscv/vector.rst index 75dd88a62e1d..3987f5f76a9d 100644 --- a/Documentation/arch/riscv/vector.rst +++ b/Documentation/arch/riscv/vector.rst @@ -15,7 +15,7 @@ status for the use of Vector in userspace. The intended usage guideline for these interfaces is to give init systems a way to modify the availability of V for processes running under its domain. Calling these interfaces is not recommended in libraries routines because libraries should not override policies -configured from the parant process. Also, users must noted that these interfaces +configured from the parent process. Also, users must note that these interfaces are not portable to non-Linux, nor non-RISC-V environments, so it is discourage to use in a portable code. To get the availability of V in an ELF program, please read :c:macro:`COMPAT_HWCAP_ISA_V` bit of :c:macro:`ELF_HWCAP` in the diff --git a/Documentation/arch/x86/mds.rst b/Documentation/arch/x86/mds.rst index c58c72362911..5a2e6c0ef04a 100644 --- a/Documentation/arch/x86/mds.rst +++ b/Documentation/arch/x86/mds.rst @@ -162,7 +162,7 @@ Mitigation points 3. It would take a large number of these precisely-timed NMIs to mount an actual attack. There's presumably not enough bandwidth. 4. The NMI in question occurs after a VERW, i.e. when user state is - restored and most interesting data is already scrubbed. Whats left + restored and most interesting data is already scrubbed. What's left is only the data that NMI touches, and that may or may not be of any interest. diff --git a/Documentation/arch/x86/x86_64/fsgs.rst b/Documentation/arch/x86/x86_64/fsgs.rst index 50960e09e1f6..d07e445dac5c 100644 --- a/Documentation/arch/x86/x86_64/fsgs.rst +++ b/Documentation/arch/x86/x86_64/fsgs.rst @@ -125,7 +125,7 @@ FSGSBASE instructions enablement FSGSBASE instructions compiler support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE +GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE instructions. Clang 5 supports them as well. =================== =========================== @@ -135,7 +135,7 @@ instructions. Clang 5 supports them as well. _writegsbase_u64() Write the GS base register =================== =========================== -To utilize these instrinsics must be included in the source +To utilize these intrinsics must be included in the source code and the compiler option -mfsgsbase has to be added. Compiler support for FS/GS based addressing From 2259b06938410a47bde8ac43c5c9cde433064ce1 Mon Sep 17 00:00:00 2001 From: Karol Przybylski Date: Wed, 14 Aug 2024 17:55:58 +0200 Subject: [PATCH 116/126] docs: block: Fix grammar and spelling mistakes in bfq-iosched.rst This patch corrects several grammar and spelling errors in the Documentation/block/bfq-iosched.rst file. These changes improve the clarity and readability of the documentation. Signed-off-by: Karol Przybylski Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240814155558.3672833-1-karprzy7@gmail.com --- Documentation/block/bfq-iosched.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst index df3a8a47f58c..a0ff0eb11e7f 100644 --- a/Documentation/block/bfq-iosched.rst +++ b/Documentation/block/bfq-iosched.rst @@ -9,7 +9,7 @@ controllers), BFQ's main features are: - BFQ guarantees a high system and application responsiveness, and a low latency for time-sensitive applications, such as audio or video players; -- BFQ distributes bandwidth, and not just time, among processes or +- BFQ distributes bandwidth, not just time, among processes or groups (switching back to time distribution when needed to keep throughput high). @@ -111,7 +111,7 @@ Higher speed for code-development tasks If some additional workload happens to be executed in parallel, then BFQ executes the I/O-related components of typical code-development -tasks (compilation, checkout, merge, ...) much more quickly than CFQ, +tasks (compilation, checkout, merge, etc.) much more quickly than CFQ, NOOP or DEADLINE. High throughput @@ -127,9 +127,9 @@ Strong fairness, bandwidth and delay guarantees ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ BFQ distributes the device throughput, and not just the device time, -among I/O-bound applications in proportion their weights, with any +among I/O-bound applications in proportion to their weights, with any workload and regardless of the device parameters. From these bandwidth -guarantees, it is possible to compute tight per-I/O-request delay +guarantees, it is possible to compute a tight per-I/O-request delay guarantees by a simple formula. If not configured for strict service guarantees, BFQ switches to time-based resource sharing (only) for applications that would otherwise cause a throughput loss. @@ -199,7 +199,7 @@ plus a lot of code, are borrowed from CFQ. - On flash-based storage with internal queueing of commands (typically NCQ), device idling happens to be always detrimental - for throughput. So, with these devices, BFQ performs idling + to throughput. So, with these devices, BFQ performs idling only when strictly needed for service guarantees, i.e., for guaranteeing low latency or fairness. In these cases, overall throughput may be sub-optimal. No solution currently exists to @@ -212,7 +212,7 @@ plus a lot of code, are borrowed from CFQ. and to reduce their latency. The most important action taken to achieve this goal is to give to the queues associated with these applications more than their fair share of the device - throughput. For brevity, we call just "weight-raising" the whole + throughput. For brevity, we call it just "weight-raising" the whole sets of actions taken by BFQ to privilege these queues. In particular, BFQ provides a milder form of weight-raising for interactive applications, and a stronger form for soft real-time @@ -231,7 +231,7 @@ plus a lot of code, are borrowed from CFQ. responsive in detecting interleaved I/O (cooperating processes), that it enables BFQ to achieve a high throughput, by queue merging, even for queues for which CFQ needs a different - mechanism, preemption, to get a high throughput. As such EQM is a + mechanism, preemption, to get a high throughput. As such, EQM is a unified mechanism to achieve a high throughput with interleaved I/O. @@ -254,7 +254,7 @@ plus a lot of code, are borrowed from CFQ. - First, with any proportional-share scheduler, the maximum deviation with respect to an ideal service is proportional to the maximum budget (slice) assigned to queues. As a consequence, - BFQ can keep this deviation tight not only because of the + BFQ can keep this deviation tight, not only because of the accurate service of B-WF2Q+, but also because BFQ *does not* need to assign a larger budget to a queue to let the queue receive a higher fraction of the device throughput. @@ -327,7 +327,7 @@ applications. Unset this tunable if you need/want to control weights. slice_idle ---------- -This parameter specifies how long BFQ should idle for next I/O +This parameter specifies how long BFQ should idle for the next I/O request, when certain sync BFQ queues become empty. By default slice_idle is a non-zero value. Idling has a double purpose: boosting throughput and making sure that the desired throughput distribution is @@ -365,7 +365,7 @@ terms of I/O-request dispatches. To guarantee that the actual service order then corresponds to the dispatch order, the strict_guarantees tunable must be set too. -There is an important flipside for idling: apart from the above cases +There is an important flip side to idling: apart from the above cases where it is beneficial also for throughput, idling can severely impact throughput. One important case is random workload. Because of this issue, BFQ tends to avoid idling as much as possible, when it is not @@ -475,7 +475,7 @@ max_budget Maximum amount of service, measured in sectors, that can be provided to a BFQ queue once it is set in service (of course within the limits -of the above timeout). According to what said in the description of +of the above timeout). According to what was said in the description of the algorithm, larger values increase the throughput in proportion to the percentage of sequential I/O requests issued. The price of larger values is that they coarsen the granularity of short-term bandwidth From 49417ad48abb8ddb56168da19ff173d8241bdba5 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 11 Jun 2024 10:04:36 +0800 Subject: [PATCH 117/126] docs/zh_CN: update the translation of security-bugs Update to commit 5928d411557e ("Documentation: Document the Linux Kernel CVE process") commit 0217f3944aeb ("Documentation: security-bugs.rst: linux-distros relaxed their rules") commit 3c1897ae4b6b ("Documentation: security-bugs.rst: clarify CVE handling") commit 4fee0915e649 ("Documentation: security-bugs.rst: update preferences when dealing with the linux-distros group") commit 44ac5abac86b ("Documentation/security-bugs: move from admin-guide/ to process/") Signed-off-by: Dongliang Mu Reviewed-by: Alex Shi Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240611020514.48770-1-dzm91@hust.edu.cn --- .../translations/zh_CN/admin-guide/index.rst | 1 - .../zh_CN/admin-guide/reporting-issues.rst | 4 +- .../translations/zh_CN/process/index.rst | 3 +- .../security-bugs.rst | 42 ++++++++++++------- .../zh_CN/process/submitting-patches.rst | 2 +- .../zh_TW/admin-guide/reporting-issues.rst | 4 +- .../zh_TW/process/submitting-patches.rst | 2 +- 7 files changed, 34 insertions(+), 24 deletions(-) rename Documentation/translations/zh_CN/{admin-guide => process}/security-bugs.rst (57%) diff --git a/Documentation/translations/zh_CN/admin-guide/index.rst b/Documentation/translations/zh_CN/admin-guide/index.rst index 0db80ab830a0..15d9ab5993a7 100644 --- a/Documentation/translations/zh_CN/admin-guide/index.rst +++ b/Documentation/translations/zh_CN/admin-guide/index.rst @@ -37,7 +37,6 @@ Todolist: reporting-issues reporting-regressions - security-bugs bug-hunting bug-bisect tainted-kernels diff --git a/Documentation/translations/zh_CN/admin-guide/reporting-issues.rst b/Documentation/translations/zh_CN/admin-guide/reporting-issues.rst index 59e51e3539b4..9ff4ba94391d 100644 --- a/Documentation/translations/zh_CN/admin-guide/reporting-issues.rst +++ b/Documentation/translations/zh_CN/admin-guide/reporting-issues.rst @@ -300,7 +300,7 @@ Documentation/admin-guide/reporting-regressions.rst 对此进行了更详细的 添加到回归跟踪列表中,以确保它不会被忽略。 什么是安全问题留给您自己判断。在继续之前,请考虑阅读 -Documentation/translations/zh_CN/admin-guide/security-bugs.rst , +Documentation/translations/zh_CN/process/security-bugs.rst , 因为它提供了如何最恰当地处理安全问题的额外细节。 当发生了完全无法接受的糟糕事情时,此问题就是一个“非常严重的问题”。例如, @@ -983,7 +983,7 @@ Documentation/admin-guide/reporting-regressions.rst ;它还提供了大量其 报告,请将报告的文本转发到这些地址;但请在报告的顶部加上注释,表明您提交了 报告,并附上工单链接。 -更多信息请参见 Documentation/translations/zh_CN/admin-guide/security-bugs.rst 。 +更多信息请参见 Documentation/translations/zh_CN/process/security-bugs.rst 。 发布报告后的责任 diff --git a/Documentation/translations/zh_CN/process/index.rst b/Documentation/translations/zh_CN/process/index.rst index 5a5cd7c01c62..3bcb3bdaf533 100644 --- a/Documentation/translations/zh_CN/process/index.rst +++ b/Documentation/translations/zh_CN/process/index.rst @@ -49,10 +49,11 @@ TODOLIST: embargoed-hardware-issues cve + security-bugs TODOLIST: -* security-bugs +* handling-regressions 其它大多数开发人员感兴趣的社区指南: diff --git a/Documentation/translations/zh_CN/admin-guide/security-bugs.rst b/Documentation/translations/zh_CN/process/security-bugs.rst similarity index 57% rename from Documentation/translations/zh_CN/admin-guide/security-bugs.rst rename to Documentation/translations/zh_CN/process/security-bugs.rst index d6b8f8a4e7f6..a8f5fcbfadc9 100644 --- a/Documentation/translations/zh_CN/admin-guide/security-bugs.rst +++ b/Documentation/translations/zh_CN/process/security-bugs.rst @@ -1,3 +1,5 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + .. include:: ../disclaimer-zh_CN.rst :Original: :doc:`../../../process/security-bugs` @@ -5,6 +7,7 @@ :译者: 吴想成 Wu XiangCheng + 慕冬亮 Dongliang Mu 安全缺陷 ========= @@ -17,13 +20,13 @@ Linux内核开发人员非常重视安全性。因此我们想知道何时发现 可以通过电子邮件联系Linux内核安全团队。这是一个安全人员 的私有列表,他们将帮助验证错误报告并开发和发布修复程序。如果您已经有了一个 -修复,请将其包含在您的报告中,这样可以大大加快进程。安全团队可能会从区域维护 +修复,请将其包含在您的报告中,这样可以大大加快处理进程。安全团队可能会从区域维护 人员那里获得额外的帮助,以理解和修复安全漏洞。 与任何缺陷一样,提供的信息越多,诊断和修复就越容易。如果您不清楚哪些信息有用, 请查看“Documentation/translations/zh_CN/admin-guide/reporting-issues.rst”中 -概述的步骤。任何利用漏洞的攻击代码都非常有用,未经报告者同意不会对外发布,除 -非已经公开。 +概述的步骤。任何利用漏洞的攻击代码都非常有用,未经报告者同意不会对外发布, +除非已经公开。 请尽可能发送无附件的纯文本电子邮件。如果所有的细节都藏在附件里,那么就很难对 一个复杂的问题进行上下文引用的讨论。把它想象成一个 @@ -49,24 +52,31 @@ Linux内核开发人员非常重视安全性。因此我们想知道何时发现 换句话说,我们唯一感兴趣的是修复缺陷。提交给安全列表的所有其他资料以及对报告 的任何后续讨论,即使在解除限制之后,也将永久保密。 -协调 ------- +与其他团队协调 +-------------- -对敏感缺陷(例如那些可能导致权限提升的缺陷)的修复可能需要与私有邮件列表 -进行协调,以便分发供应商做好准备,在公开披露 -上游补丁时发布一个已修复的内核。发行版将需要一些时间来测试建议的补丁,通常 -会要求至少几天的限制,而供应商更新发布更倾向于周二至周四。若合适,安全团队 -可以协助这种协调,或者报告者可以从一开始就包括linux发行版。在这种情况下,请 -记住在电子邮件主题行前面加上“[vs]”,如linux发行版wiki中所述: -。 +虽然内核安全团队仅关注修复漏洞,但还有其他组织关注修复发行版上的安全问题以及协调 +操作系统厂商的漏洞披露。协调通常由 "linux-distros" 邮件列表处理,而披露则由 +公共 "oss-security" 邮件列表进行。两者紧密关联且被展示在 linux-distros 维基: + + +请注意,这三个列表的各自政策和规则是不同的,因为它们追求不同的目标。内核安全团队 +与其他团队之间的协调很困难,因为对于内核安全团队,保密期(即最大允许天数)是从补丁 +可用时开始,而 "linux-distros" 则从首次发布到列表时开始计算,无论是否存在补丁。 + +因此,内核安全团队强烈建议,作为一位潜在安全问题的报告者,在受影响代码的维护者 +接受补丁之前,且在您阅读上述发行版维基页面并完全理解联系 "linux-distros" +邮件列表会对您和内核社区施加的要求之前,不要联系 "linux-distros" 邮件列表。 +这也意味着通常情况下不要同时抄送两个邮件列表,除非在协调时有已接受但尚未合并的补丁。 +换句话说,在补丁被接受之前,不要抄送 "linux-distros";在修复程序被合并之后, +不要抄送内核安全团队。 CVE分配 -------- -安全团队通常不分配CVE,我们也不需要它们来进行报告或修复,因为这会使过程不必 -要的复杂化,并可能耽误缺陷处理。如果报告者希望在公开披露之前分配一个CVE编号, -他们需要联系上述的私有linux-distros列表。当在提供补丁之前已有这样的CVE编号时, -如报告者愿意,最好在提交消息中提及它。 +安全团队不分配 CVE,同时我们也不需要 CVE 来报告或修复漏洞,因为这会使过程不必要 +的复杂化,并可能延误漏洞处理。如果报告者希望为确认的问题分配一个 CVE 编号, +可以联系 :doc:`内核 CVE 分配团队 <../process/cve>` 获取。 保密协议 --------- diff --git a/Documentation/translations/zh_CN/process/submitting-patches.rst b/Documentation/translations/zh_CN/process/submitting-patches.rst index 7864107e60a8..7ca16bda3709 100644 --- a/Documentation/translations/zh_CN/process/submitting-patches.rst +++ b/Documentation/translations/zh_CN/process/submitting-patches.rst @@ -208,7 +208,7 @@ torvalds@linux-foundation.org 。他收到的邮件很多,所以一般来说 如果您有修复可利用安全漏洞的补丁,请将该补丁发送到 security@kernel.org 。对于 严重的bug,可以考虑短期禁令以允许分销商(有时间)向用户发布补丁;在这种情况下, 显然不应将补丁发送到任何公共列表。 -参见 Documentation/translations/zh_CN/admin-guide/security-bugs.rst 。 +参见 Documentation/translations/zh_CN/process/security-bugs.rst 。 修复已发布内核中严重错误的补丁程序应该抄送给稳定版维护人员,方法是把以下列行 放进补丁的签准区(注意,不是电子邮件收件人):: diff --git a/Documentation/translations/zh_TW/admin-guide/reporting-issues.rst b/Documentation/translations/zh_TW/admin-guide/reporting-issues.rst index bc132b25f2ae..1d4e4c7a6750 100644 --- a/Documentation/translations/zh_TW/admin-guide/reporting-issues.rst +++ b/Documentation/translations/zh_TW/admin-guide/reporting-issues.rst @@ -301,7 +301,7 @@ Documentation/admin-guide/reporting-regressions.rst 對此進行了更詳細的 添加到迴歸跟蹤列表中,以確保它不會被忽略。 什麼是安全問題留給您自己判斷。在繼續之前,請考慮閱讀 -Documentation/translations/zh_CN/admin-guide/security-bugs.rst , +Documentation/translations/zh_CN/process/security-bugs.rst , 因爲它提供瞭如何最恰當地處理安全問題的額外細節。 當發生了完全無法接受的糟糕事情時,此問題就是一個“非常嚴重的問題”。例如, @@ -984,7 +984,7 @@ Documentation/admin-guide/reporting-regressions.rst ;它還提供了大量其 報告,請將報告的文本轉發到這些地址;但請在報告的頂部加上註釋,表明您提交了 報告,並附上工單鏈接。 -更多信息請參見 Documentation/translations/zh_CN/admin-guide/security-bugs.rst 。 +更多信息請參見 Documentation/translations/zh_CN/process/security-bugs.rst 。 發佈報告後的責任 diff --git a/Documentation/translations/zh_TW/process/submitting-patches.rst b/Documentation/translations/zh_TW/process/submitting-patches.rst index f12f2f193f85..64de92c07906 100644 --- a/Documentation/translations/zh_TW/process/submitting-patches.rst +++ b/Documentation/translations/zh_TW/process/submitting-patches.rst @@ -209,7 +209,7 @@ torvalds@linux-foundation.org 。他收到的郵件很多,所以一般來說 如果您有修復可利用安全漏洞的補丁,請將該補丁發送到 security@kernel.org 。對於 嚴重的bug,可以考慮短期禁令以允許分銷商(有時間)向用戶發佈補丁;在這種情況下, 顯然不應將補丁發送到任何公共列表。 -參見 Documentation/translations/zh_CN/admin-guide/security-bugs.rst 。 +參見 Documentation/translations/zh_CN/process/security-bugs.rst 。 修復已發佈內核中嚴重錯誤的補丁程序應該抄送給穩定版維護人員,方法是把以下列行 放進補丁的籤準區(注意,不是電子郵件收件人):: From d895eb31ae563152d398e2584c707d1efe7911ed Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Fri, 6 Sep 2024 11:00:59 +0300 Subject: [PATCH 118/126] accel/qaic: Fix a typo Fix a typo in documentation. Signed-off-by: Andrew Kreimer Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20240906080136.4423-1-algonell@gmail.com --- Documentation/accel/qaic/qaic.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/accel/qaic/qaic.rst b/Documentation/accel/qaic/qaic.rst index efb7771273bb..628bf2f7a416 100644 --- a/Documentation/accel/qaic/qaic.rst +++ b/Documentation/accel/qaic/qaic.rst @@ -93,7 +93,7 @@ commands (does not impact QAIC). uAPI ==== -QAIC creates an accel device per phsyical PCIe device. This accel device exists +QAIC creates an accel device per physical PCIe device. This accel device exists for as long as the PCIe device is known to Linux. The PCIe device may not be in the state to accept requests from userspace at From 406c4c5ee4ea738b454646bc0ee5d7d81413cdad Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Sun, 8 Sep 2024 12:19:28 -0400 Subject: [PATCH 119/126] docs:mm: fix spelling mistakes in heterogeneous memory management page Signed-off-by: Dennis Lam Reviewed-by: Lorenzo Stoakes Signed-off-by: Jonathan Corbet Message-ID: <20240908161928.3700-1-dennis.lamerice@gmail.com> --- Documentation/mm/hmm.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/mm/hmm.rst b/Documentation/mm/hmm.rst index 0595098a74d9..f6d53c37a2ca 100644 --- a/Documentation/mm/hmm.rst +++ b/Documentation/mm/hmm.rst @@ -66,7 +66,7 @@ combinatorial explosion in the library entry points. Finally, with the advance of high level language constructs (in C++ but in other languages too) it is now possible for the compiler to leverage GPUs and other devices without programmer knowledge. Some compiler identified patterns -are only do-able with a shared address space. It is also more reasonable to use +are only doable with a shared address space. It is also more reasonable to use a shared address space for all other patterns. @@ -267,7 +267,7 @@ functions are designed to make drivers easier to write and to centralize common code across drivers. Before migrating pages to device private memory, special device private -``struct page`` need to be created. These will be used as special "swap" +``struct page`` needs to be created. These will be used as special "swap" page table entries so that a CPU process will fault if it tries to access a page that has been migrated to device private memory. @@ -322,7 +322,7 @@ between device driver specific code and shared common code: The ``invalidate_range_start()`` callback is passed a ``struct mmu_notifier_range`` with the ``event`` field set to ``MMU_NOTIFY_MIGRATE`` and the ``owner`` field set to - the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This is + the ``args->pgmap_owner`` field passed to migrate_vma_setup(). This allows the device driver to skip the invalidation callback and only invalidate device private MMU mappings that are actually migrating. This is explained more in the next section. @@ -405,7 +405,7 @@ can be used to make a memory range inaccessible from userspace. This replaces all mappings for pages in the given range with special swap entries. Any attempt to access the swap entry results in a fault which is -resovled by replacing the entry with the original mapping. A driver gets +resolved by replacing the entry with the original mapping. A driver gets notified that the mapping has been changed by MMU notifiers, after which point it will no longer have exclusive access to the page. Exclusive access is guaranteed to last until the driver drops the page lock and page reference, at @@ -431,7 +431,7 @@ Same decision was made for memory cgroup. Device memory pages are accounted against same memory cgroup a regular page would be accounted to. This does simplify migration to and from device memory. This also means that migration back from device memory to regular memory cannot fail because it would -go above memory cgroup limit. We might revisit this choice latter on once we +go above memory cgroup limit. We might revisit this choice later on once we get more experience in how device memory is used and its impact on memory resource control. From c5d436f05a3f45053cfc820ae140899b916c6e00 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Sat, 7 Sep 2024 15:21:39 +0300 Subject: [PATCH 120/126] docs/process: fix typos Fix typos in documentation. Signed-off-by: Andrew Kreimer Signed-off-by: Jonathan Corbet Message-ID: <20240907122534.15998-1-algonell@gmail.com> --- Documentation/process/coding-style.rst | 2 +- Documentation/process/maintainer-tip.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 8e30c8f7697d..19d2ed47ff79 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -986,7 +986,7 @@ that can go into these 5 milliseconds. A reasonable rule of thumb is to not put inline at functions that have more than 3 lines of code in them. An exception to this rule are the cases where -a parameter is known to be a compiletime constant, and as a result of this +a parameter is known to be a compile time constant, and as a result of this constantness you *know* the compiler will be able to optimize most of your function away at compile time. For a good example of this later case, see the kmalloc() inline function. diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index ba312345d030..349a27a53343 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -154,7 +154,7 @@ Examples for illustration: We modify the hot cpu handling to cancel the delayed work on the dying cpu and run the worker immediately on a different cpu in same domain. We - donot flush the worker because the MBM overflow worker reschedules the + do not flush the worker because the MBM overflow worker reschedules the worker on same CPU and scans the domain->cpu_mask to get the domain pointer. From 5d5f9229ab0143639ec7cd3beea88c8e763cf5a7 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Sat, 7 Sep 2024 15:02:08 +0800 Subject: [PATCH 121/126] docs/zh_CN: add the translation of kbuild/gcc-plugins.rst Finish the translation of kbuild/gcc-plugins.rst and move gcc-plugins from TODO to the main body. Update to commit 3832d1fd84b6 ("docs/core-api: expand Fedora instructions for GCC plugins") Signed-off-by: Dongliang Mu Reviewed-by: Alex Shi Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet Message-ID: <20240907070244.206808-1-dzm91@hust.edu.cn> --- .../translations/zh_CN/kbuild/gcc-plugins.rst | 126 ++++++++++++++++++ .../translations/zh_CN/kbuild/index.rst | 2 +- 2 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 Documentation/translations/zh_CN/kbuild/gcc-plugins.rst diff --git a/Documentation/translations/zh_CN/kbuild/gcc-plugins.rst b/Documentation/translations/zh_CN/kbuild/gcc-plugins.rst new file mode 100644 index 000000000000..67a8abbf5887 --- /dev/null +++ b/Documentation/translations/zh_CN/kbuild/gcc-plugins.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/kbuild/gcc-plugins.rst +:Translator: 慕冬亮 Dongliang Mu + +================ +GCC 插件基础设施 +================ + + +介绍 +==== + +GCC 插件是为编译器提供额外功能的可加载模块 [1]_。它们对于运行时插装和静态分析非常有用。 +我们可以在编译过程中通过回调 [2]_,GIMPLE [3]_,IPA [4]_ 和 RTL Passes [5]_ +(译者注:Pass 是编译器所采用的一种结构化技术,用于完成编译对象的分析、优化或转换等功能) +来分析、修改和添加更多的代码。 + +内核的 GCC 插件基础设施支持构建树外模块、交叉编译和在单独的目录中构建。插件源文件必须由 +C++ 编译器编译。 + +目前 GCC 插件基础设施只支持一些架构。搜索 "select HAVE_GCC_PLUGINS" 来查找支持 +GCC 插件的架构。 + +这个基础设施是从 grsecurity [6]_ 和 PaX [7]_ 移植过来的。 + +-- + +.. [1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html +.. [2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API +.. [3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html +.. [4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html +.. [5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html +.. [6] https://grsecurity.net/ +.. [7] https://pax.grsecurity.net/ + + +目的 +==== + +GCC 插件的设计目的是提供一个用于试验 GCC 或 Clang 上游没有的潜在编译器功能的场所。 +一旦它们的实用性得到验证,这些功能将被添加到 GCC(和 Clang)的上游。随后,在所有 +支持的 GCC 版本都支持这些功能后,它们会被从内核中移除。 + +具体来说,新插件应该只实现上游编译器(GCC 和 Clang)不支持的功能。 + +当 Clang 中存在 GCC 中不存在的某项功能时,应努力将该功能做到 GCC 上游(而不仅仅 +是作为内核专用的 GCC 插件),以使整个生态都能从中受益。 + +类似的,如果 GCC 插件提供的功能在 Clang 中 **不** 存在,但该功能被证明是有用的,也应 +努力将该功能上传到 GCC(和 Clang)。 + +在上游 GCC 提供了某项功能后,该插件将无法在相应的 GCC 版本(以及更高版本)下编译。 +一旦所有内核支持的 GCC 版本都提供了该功能,该插件将从内核中移除。 + + +文件 +==== + +**$(src)/scripts/gcc-plugins** + + 这是 GCC 插件的目录。 + +**$(src)/scripts/gcc-plugins/gcc-common.h** + + 这是 GCC 插件的兼容性头文件。 + 应始终包含它,而不是单独的 GCC 头文件。 + +**$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h, +$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h, +$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h, +$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h** + + 这些头文件可以自动生成 GIMPLE、SIMPLE_IPA、IPA 和 RTL passes 的注册结构。 + 与手动创建结构相比,它们更受欢迎。 + + +用法 +==== + +你必须为你的 GCC 版本安装 GCC 插件头文件,以 Ubuntu 上的 gcc-10 为例:: + + apt-get install gcc-10-plugin-dev + +或者在 Fedora 上:: + + dnf install gcc-plugin-devel libmpc-devel + +或者在 Fedora 上使用包含插件的交叉编译器时:: + + dnf install libmpc-devel + +在内核配置中启用 GCC 插件基础设施与一些你想使用的插件:: + + CONFIG_GCC_PLUGINS=y + CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y + ... + +运行 gcc(本地或交叉编译器),确保能够检测到插件头文件:: + + gcc -print-file-name=plugin + CROSS_COMPILE=arm-linux-gnu- ${CROSS_COMPILE}gcc -print-file-name=plugin + +"plugin" 这个词意味着它们没有被检测到:: + + plugin + +完整的路径则表示插件已经被检测到:: + + /usr/lib/gcc/x86_64-redhat-linux/12/plugin + +编译包括插件在内的最小工具集:: + + make scripts + +或者直接在内核中运行 make,使用循环复杂性 GCC 插件编译整个内核。 + + +4. 如何添加新的 GCC 插件 +======================== + +GCC 插件位于 scripts/gcc-plugins/。你需要将插件源文件放在 scripts/gcc-plugins/ 目录下。 +子目录创建并不支持,你必须添加在 scripts/gcc-plugins/Makefile、scripts/Makefile.gcc-plugins +和相关的 Kconfig 文件中。 diff --git a/Documentation/translations/zh_CN/kbuild/index.rst b/Documentation/translations/zh_CN/kbuild/index.rst index d906a4e88d0f..b51655d981f6 100644 --- a/Documentation/translations/zh_CN/kbuild/index.rst +++ b/Documentation/translations/zh_CN/kbuild/index.rst @@ -13,6 +13,7 @@ :maxdepth: 1 headers_install + gcc-plugins TODO: @@ -24,7 +25,6 @@ TODO: - modules - issues - reproducible-builds -- gcc-plugins - llvm .. only:: subproject and html From e6ba83cb81e84d9e2d003c5ed2749ca81843f555 Mon Sep 17 00:00:00 2001 From: Abdul Rahim Date: Sat, 7 Sep 2024 02:26:56 +0530 Subject: [PATCH 122/126] Documentation: PCI: fix typo in pci.rst Fix typo: "follow" -> "following" in pci.rst Signed-off-by: Abdul Rahim Signed-off-by: Jonathan Corbet Message-ID: <20240906205656.8261-1-abdul.rahim@myyahoo.com> --- Documentation/PCI/pci.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst index dd7b1c0c21da..f4d2662871ab 100644 --- a/Documentation/PCI/pci.rst +++ b/Documentation/PCI/pci.rst @@ -52,7 +52,7 @@ driver generally needs to perform the following initialization: - Enable DMA/processing engines When done using the device, and perhaps the module needs to be unloaded, -the driver needs to take the follow steps: +the driver needs to take the following steps: - Disable the device from generating IRQs - Release the IRQ (free_irq()) From bf78b46683229eec3a1074302851645bf43a6986 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Fri, 6 Sep 2024 16:49:13 -0400 Subject: [PATCH 123/126] docs:mm: fixed spelling and grammar mistakes on vmalloc kernel stack page Signed-off-by: Dennis Lam Reviewed-by: Randy Dunlap Signed-off-by: Jonathan Corbet Message-ID: <20240906204914.42698-2-dennis.lamerice@gmail.com> --- Documentation/mm/vmalloced-kernel-stacks.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst index 4edca515bfd7..5bc0f7ceea89 100644 --- a/Documentation/mm/vmalloced-kernel-stacks.rst +++ b/Documentation/mm/vmalloced-kernel-stacks.rst @@ -110,7 +110,7 @@ Bulk of the code is in: `kernel/fork.c `. stack_vm_area pointer in task_struct keeps track of the virtually allocated -stack and a non-null stack_vm_area pointer serves as a indication that the +stack and a non-null stack_vm_area pointer serves as an indication that the virtually mapped kernel stacks are enabled. :: @@ -120,8 +120,8 @@ virtually mapped kernel stacks are enabled. Stack overflow handling ----------------------- -Leading and trailing guard pages help detect stack overflows. When stack -overflows into the guard pages, handlers have to be careful not overflow +Leading and trailing guard pages help detect stack overflows. When the stack +overflows into the guard pages, handlers have to be careful not to overflow the stack again. When handlers are called, it is likely that very little stack space is left. @@ -148,6 +148,6 @@ Conclusions - THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and simply embed the thread_info (containing only flags) and 'int cpu' into task_struct. -- The thread stack can be free'ed as soon as the task is dead (without +- The thread stack can be freed as soon as the task is dead (without waiting for RCU) and then, if vmapped stacks are in use, cache the entire stack for reuse on the same cpu. From 0cac9253a03f762eda7051e4760a0bf059ee9176 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Sun, 8 Sep 2024 14:37:42 -0400 Subject: [PATCH 124/126] docs:filesystem: fix mispelled words on autofs page Signed-off-by: Dennis Lam Signed-off-by: Jonathan Corbet Message-ID: <20240908183741.15352-2-dennis.lamerice@gmail.com> --- Documentation/filesystems/autofs.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/autofs.rst b/Documentation/filesystems/autofs.rst index 3b6e38e646cd..1ac576458c69 100644 --- a/Documentation/filesystems/autofs.rst +++ b/Documentation/filesystems/autofs.rst @@ -18,7 +18,7 @@ key advantages: 2. The names and locations of filesystems can be stored in a remote database and can change at any time. The content - in that data base at the time of access will be used to provide + in that database at the time of access will be used to provide a target for the access. The interpretation of names in the filesystem can even be programmatic rather than database-backed, allowing wildcards for example, and can vary based on the user who @@ -423,7 +423,7 @@ The available ioctl commands are: and objects are expired if the are not in use. **AUTOFS_EXP_FORCED** causes the in use status to be ignored - and objects are expired ieven if they are in use. This assumes + and objects are expired even if they are in use. This assumes that the daemon has requested this because it is capable of performing the umount. From 2409952f645ec7235660a111f9e5474892efbe42 Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Fri, 6 Sep 2024 15:53:52 -0400 Subject: [PATCH 125/126] docs:filesystems: fix spelling and grammar mistakes Signed-off-by: Dennis Lam Reviewed-by: Theodore Ts'o Signed-off-by: Jonathan Corbet Message-ID: <20240906195400.39949-1-dennis.lamerice@gmail.com> --- Documentation/filesystems/journalling.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/journalling.rst b/Documentation/filesystems/journalling.rst index e18f90ffc6fd..0254f7d57429 100644 --- a/Documentation/filesystems/journalling.rst +++ b/Documentation/filesystems/journalling.rst @@ -137,7 +137,7 @@ Fast commits JBD2 to also allows you to perform file-system specific delta commits known as fast commits. In order to use fast commits, you will need to set following -callbacks that perform correspodning work: +callbacks that perform corresponding work: `journal->j_fc_cleanup_cb`: Cleanup function called after every full commit and fast commit. @@ -149,7 +149,7 @@ File system is free to perform fast commits as and when it wants as long as it gets permission from JBD2 to do so by calling the function :c:func:`jbd2_fc_begin_commit()`. Once a fast commit is done, the client file system should tell JBD2 about it by calling -:c:func:`jbd2_fc_end_commit()`. If file system wants JBD2 to perform a full +:c:func:`jbd2_fc_end_commit()`. If the file system wants JBD2 to perform a full commit immediately after stopping the fast commit it can do so by calling :c:func:`jbd2_fc_end_commit_fallback()`. This is useful if fast commit operation fails for some reason and the only way to guarantee consistency is for JBD2 to @@ -199,7 +199,7 @@ Journal Level .. kernel-doc:: fs/jbd2/recovery.c :internal: -Transasction Level +Transaction Level ~~~~~~~~~~~~~~~~~~ .. kernel-doc:: fs/jbd2/transaction.c From 4f77c3462308c62ffe7129cc18b9ac937f44b5a5 Mon Sep 17 00:00:00 2001 From: Shivam Chaudhary Date: Tue, 10 Sep 2024 10:57:37 +0530 Subject: [PATCH 126/126] Remove duplicate "and" in 'Linux NVMe docs. Remove duplicate occurrence of 'and' in 'Linux NVMe Feature and Quirk Policy' title heading. tested: Not breaking anything. Signed-off-by: Shivam Chaudhary Signed-off-by: Jonathan Corbet Message-ID: <20240910052737.30579-1-cvam0000@gmail.com> --- Documentation/nvme/feature-and-quirk-policy.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/nvme/feature-and-quirk-policy.rst b/Documentation/nvme/feature-and-quirk-policy.rst index c01d836d8e41..e21966bf20a8 100644 --- a/Documentation/nvme/feature-and-quirk-policy.rst +++ b/Documentation/nvme/feature-and-quirk-policy.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -======================================= -Linux NVMe feature and and quirk policy -======================================= +=================================== +Linux NVMe feature and quirk policy +=================================== This file explains the policy used to decide what is supported by the Linux NVMe driver and what is not.