diff --git a/MAINTAINERS b/MAINTAINERS index f81d7c4b33ad..b478e1e5e71b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14748,7 +14748,7 @@ F: include/uapi/linux/nitro_enclaves.h F: samples/nitro_enclaves/ NOHZ, DYNTICKS SUPPORT -M: Frederic Weisbecker +M: Frederic Weisbecker M: Thomas Gleixner M: Ingo Molnar L: linux-kernel@vger.kernel.org diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index a7ec06ce3785..515ca33b854c 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile hostprogs := vdsomunge diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index beaf9586338f..fe7a53c6781f 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -6,9 +6,7 @@ # Heavily based on the vDSO Makefiles for other archs. # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64 +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso := vgettimeofday.o note.o sigreturn.o diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index f59bd1a4ead6..d014162c5c71 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -3,9 +3,6 @@ # Makefile for vdso32 # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32 include $(srctree)/lib/vdso/Makefile # Same as cc-*option, but using CC_COMPAT instead of CC diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile index 0b6909f10667..299e4e41ebc5 100644 --- a/arch/csky/kernel/vdso/Makefile +++ b/arch/csky/kernel/vdso/Makefile @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_CKCORE_ADDR32|R_CKCORE_JUMP_SLOT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile # Symbols present in the vdso diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index d89e2ac75f7b..461240ab4436 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -1,9 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # Objects to go into the VDSO. -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_LARCH_32|R_LARCH_64|R_LARCH_MARK_LA|R_LARCH_JUMP_SLOT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 18af9474ed0e..eb56581f6d73 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -4,9 +4,7 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KCSAN_SANITIZE := n -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_MIPS_JUMP_SLOT|R_MIPS_GLOB_DAT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso-y := elf.o vgettimeofday.o sigreturn.o diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 66f723f53be2..4c3f34485f08 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso, has to be asm only for now -ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24 +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 06e6b27f3bcc..a04b3bc35ca2 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -1,9 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only # Copied from arch/tile/kernel/vdso/Makefile -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_RISCV_32|R_RISCV_64|R_RISCV_JUMP_SLOT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile # Symbols present in the vdso vdso-syms = rt_sigreturn diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index 245bddfe9bc0..bafd3147eb4e 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -2,9 +2,8 @@ # List of files in the vdso KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso32 = vdso_user_wrapper-32.o note-32.o diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 34f9542636e9..a766d286e15f 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -2,9 +2,8 @@ # List of files in the vdso KCOV_INSTRUMENT := n -ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE -ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile obj-vdso64 = vdso_user_wrapper.o note.o obj-cvdso64 = vdso64_generic.o getcpu.o diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index f4f0625691fd..73f83233d25d 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -13,7 +13,7 @@ #include #include -static enum cc_vendor vendor __ro_after_init; +enum cc_vendor cc_vendor __ro_after_init; static u64 cc_mask __ro_after_init; static bool intel_cc_platform_has(enum cc_attr attr) @@ -99,7 +99,7 @@ static bool amd_cc_platform_has(enum cc_attr attr) bool cc_platform_has(enum cc_attr attr) { - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: return amd_cc_platform_has(attr); case CC_VENDOR_INTEL: @@ -119,7 +119,7 @@ u64 cc_mkenc(u64 val) * - for AMD, bit *set* means the page is encrypted * - for AMD with vTOM and for Intel, *clear* means encrypted */ - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: if (sev_status & MSR_AMD64_SNP_VTOM) return val & ~cc_mask; @@ -135,7 +135,7 @@ u64 cc_mkenc(u64 val) u64 cc_mkdec(u64 val) { /* See comment in cc_mkenc() */ - switch (vendor) { + switch (cc_vendor) { case CC_VENDOR_AMD: if (sev_status & MSR_AMD64_SNP_VTOM) return val | cc_mask; @@ -149,11 +149,6 @@ u64 cc_mkdec(u64 val) } EXPORT_SYMBOL_GPL(cc_mkdec); -__init void cc_set_vendor(enum cc_vendor v) -{ - vendor = v; -} - __init void cc_set_mask(u64 mask) { cc_mask = mask; diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 1506a22a4fb6..6a1821bd7d5e 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,10 +3,7 @@ # Building vDSO images for x86. # -# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before -# the inclusion of generic Makefile. -ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| -ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE +# Include the generic Makefile to check the built vdso. include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index d2c6a2e8d04d..eb08796002f3 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -10,13 +10,30 @@ enum cc_vendor { CC_VENDOR_INTEL, }; -void cc_set_vendor(enum cc_vendor v); -void cc_set_mask(u64 mask); - #ifdef CONFIG_ARCH_HAS_CC_PLATFORM +extern enum cc_vendor cc_vendor; + +static inline enum cc_vendor cc_get_vendor(void) +{ + return cc_vendor; +} + +static inline void cc_set_vendor(enum cc_vendor vendor) +{ + cc_vendor = vendor; +} + +void cc_set_mask(u64 mask); u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); #else +static inline enum cc_vendor cc_get_vendor(void) +{ + return CC_VENDOR_NONE; +} + +static inline void cc_set_vendor(enum cc_vendor vendor) { } + static inline u64 cc_mkenc(u64 val) { return val; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8d73004e4cac..a1e4fa58b357 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -647,7 +647,11 @@ static inline void spin_lock_prefetch(const void *x) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else -#define INIT_THREAD { } +extern unsigned long __end_init_task[]; + +#define INIT_THREAD { \ + .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \ +} extern unsigned long KSTK_ESP(struct task_struct *task); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index a336feef0af1..f6a1737c77be 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -59,7 +59,6 @@ extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; -extern unsigned long initial_gs; extern unsigned long initial_stack; #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long initial_vc_handler; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index b4dbb20dab1a..bf2c51df9e0b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -199,5 +199,8 @@ extern void nmi_selftest(void); #define nmi_selftest() do { } while (0) #endif -#endif /* __ASSEMBLY__ */ +extern unsigned int smpboot_control; + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3b7f4cdbf2e0..1328c221af30 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -111,13 +111,26 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); - early_gdt_descr.address = - (unsigned long)get_cpu_gdt_rw(smp_processor_id()); - initial_gs = per_cpu_offset(smp_processor_id()); + /* + * As each CPU starts up, it will find its own stack pointer + * from its current_task->thread.sp. Typically that will be + * the idle thread for a newly-started AP, or even the boot + * CPU which will find it set to &init_task in the static + * per-cpu data. + * + * Make the resuming CPU use the temporary stack at startup + * by setting current->thread.sp to point to that. The true + * %rsp will be restored with the rest of the CPU context, + * by do_suspend_lowlevel(). And unwinders don't care about + * the abuse of ->thread.sp because it's a dead variable + * while the thread is running on the CPU anyway; the true + * value is in the actual %rsp register. + */ + current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack); + smpboot_control = smp_processor_id(); #endif initial_code = (unsigned long)wakeup_long64; - saved_magic = 0x123456789abcdef0L; + saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 20d9a604da7c..770557110051 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -422,10 +422,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new) if (vector && !eilvt_entry_is_changeable(vector, new)) /* may not change if vectors are different */ return rsvd; - rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); - } while (rsvd != new); + } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new)); - rsvd &= ~APIC_EILVT_MASKED; + rsvd = new & ~APIC_EILVT_MASKED; if (rsvd && rsvd != vector) pr_info("LVT offset %d assigned for vector 0x%02x\n", offset, rsvd); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 146671de9ddc..4241dc243aa8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2478,17 +2478,21 @@ static int io_apic_get_redir_entries(int ioapic) unsigned int arch_dynirq_lower_bound(unsigned int from) { + unsigned int ret; + /* * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use * gsi_top if ioapic_dynirq_base hasn't been initialized yet. */ - if (!ioapic_initialized) - return gsi_top; + ret = ioapic_dynirq_base ? : gsi_top; + /* - * For DT enabled machines ioapic_dynirq_base is irrelevant and not - * updated. So simply return @from if ioapic_dynirq_base == 0. + * For DT enabled machines ioapic_dynirq_base is irrelevant and + * always 0. gsi_top can be 0 if there is no IO/APIC registered. + * 0 is an invalid interrupt number for dynamic allocations. Return + * @from instead. */ - return ioapic_dynirq_base ? : from; + return ret ? : from; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e696e22d0531..b2b2b7f3e03f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -9,11 +9,7 @@ #include "local.h" -struct cluster_mask { - unsigned int clusterid; - int node; - struct cpumask mask; -}; +#define apic_cluster(apicid) ((apicid) >> 4) /* * __x2apic_send_IPI_mask() possibly needs to read @@ -23,8 +19,7 @@ struct cluster_mask { static u32 *x86_cpu_to_logical_apicid __read_mostly; static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); -static struct cluster_mask *cluster_hotplug_mask; +static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { @@ -60,10 +55,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) /* Collapse cpus in a cluster so a single IPI per cluster is sent */ for_each_cpu(cpu, tmpmsk) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, cpu); dest = 0; - for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + for_each_cpu_and(clustercpu, tmpmsk, cmsk) dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) @@ -71,7 +66,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ - cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); + cpumask_andnot(tmpmsk, tmpmsk, cmsk); } local_irq_restore(flags); @@ -105,55 +100,98 @@ static u32 x2apic_calc_apicid(unsigned int cpu) static void init_x2apic_ldr(void) { - struct cluster_mask *cmsk = this_cpu_read(cluster_masks); - u32 cluster, apicid = apic_read(APIC_LDR); - unsigned int cpu; + struct cpumask *cmsk = this_cpu_read(cluster_masks); - x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; + BUG_ON(!cmsk); - if (cmsk) - goto update; - - cluster = apicid >> 16; - for_each_online_cpu(cpu) { - cmsk = per_cpu(cluster_masks, cpu); - /* Matching cluster found. Link and update it. */ - if (cmsk && cmsk->clusterid == cluster) - goto update; - } - cmsk = cluster_hotplug_mask; - cmsk->clusterid = cluster; - cluster_hotplug_mask = NULL; -update: - this_cpu_write(cluster_masks, cmsk); - cpumask_set_cpu(smp_processor_id(), &cmsk->mask); + cpumask_set_cpu(smp_processor_id(), cmsk); } -static int alloc_clustermask(unsigned int cpu, int node) +/* + * As an optimisation during boot, set the cluster_mask for all present + * CPUs at once, to prevent each of them having to iterate over the others + * to find the existing cluster_mask. + */ +static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster) { + int cpu_i; + + for_each_present_cpu(cpu_i) { + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i); + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster) + continue; + + if (WARN_ON_ONCE(*cpu_cmsk == cmsk)) + continue; + + BUG_ON(*cpu_cmsk); + *cpu_cmsk = cmsk; + } +} + +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) +{ + struct cpumask *cmsk = NULL; + unsigned int cpu_i; + + /* + * At boot time, the CPU present mask is stable. The cluster mask is + * allocated for the first CPU in the cluster and propagated to all + * present siblings in the cluster. If the cluster mask is already set + * on entry to this function for a given CPU, there is nothing to do. + */ if (per_cpu(cluster_masks, cpu)) return 0; - /* - * If a hotplug spare mask exists, check whether it's on the right - * node. If not, free it and allocate a new one. - */ - if (cluster_hotplug_mask) { - if (cluster_hotplug_mask->node == node) - return 0; - kfree(cluster_hotplug_mask); - } - cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), - GFP_KERNEL, node); - if (!cluster_hotplug_mask) + if (system_state < SYSTEM_RUNNING) + goto alloc; + + /* + * On post boot hotplug for a CPU which was not present at boot time, + * iterate over all possible CPUs (even those which are not present + * any more) to find any existing cluster mask. + */ + for_each_possible_cpu(cpu_i) { + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + cmsk = per_cpu(cluster_masks, cpu_i); + /* + * If the cluster is already initialized, just store + * the mask and return. There's no need to propagate. + */ + if (cmsk) { + per_cpu(cluster_masks, cpu) = cmsk; + return 0; + } + } + } + /* + * No CPU in the cluster has ever been initialized, so fall through to + * the boot time code which will also populate the cluster mask for any + * other CPU in the cluster which is (now) present. + */ +alloc: + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); + if (!cmsk) return -ENOMEM; - cluster_hotplug_mask->node = node; + per_cpu(cluster_masks, cpu) = cmsk; + prefill_clustermask(cmsk, cpu, cluster); + return 0; } static int x2apic_prepare_cpu(unsigned int cpu) { - if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + u32 phys_apicid = apic->cpu_present_to_apicid(cpu); + u32 cluster = apic_cluster(phys_apicid); + u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + + x86_cpu_to_logical_apicid[cpu] = logical_apicid; + + if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) return -ENOMEM; if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) return -ENOMEM; @@ -162,10 +200,10 @@ static int x2apic_prepare_cpu(unsigned int cpu) static int x2apic_dead_cpu(unsigned int dead_cpu) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu); if (cmsk) - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + cpumask_clear_cpu(dead_cpu, cmsk); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 283dcd2f62c8..dc3576303f1a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -115,6 +115,7 @@ static void __used common(void) OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); + OFFSET(X86_current_task, pcpu_hot, current_task); #ifdef CONFIG_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 222efd4a09bc..6a8238702eab 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -61,23 +61,15 @@ SYM_CODE_START_NOALIGN(startup_64) * tables and then reload them. */ - /* Set up the stack for verify_cpu(), similar to initial_stack below */ - leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp + /* Set up the stack for verify_cpu() */ + leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp leaq _text(%rip), %rdi - /* - * initial_gs points to initial fixed_percpu_data struct with storage for - * the stack protector canary. Global pointer fixups are needed at this - * stage, so apply them as is done in fixup_pointer(), and initialize %gs - * such that the canary can be accessed at %gs:40 for subsequent C calls. - */ + /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx - movq initial_gs(%rip), %rax - movq $_text, %rdx - subq %rdx, %rax - addq %rdi, %rax - movq %rax, %rdx + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx + movl %edx, %eax shrq $32, %rdx wrmsr @@ -241,13 +233,36 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) UNWIND_HINT_EMPTY ANNOTATE_NOENDBR // above +#ifdef CONFIG_SMP + movl smpboot_control(%rip), %ecx + + /* Get the per cpu offset for the given CPU# which is in ECX */ + movq __per_cpu_offset(,%rcx,8), %rdx +#else + xorl %edx, %edx /* zero-extended to clear all of RDX */ +#endif /* CONFIG_SMP */ + + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack. + * + * RDX contains the per-cpu offset + */ + movq pcpu_hot + X86_current_task(%rdx), %rax + movq TASK_threadsp(%rax), %rsp + /* * We must switch to a new descriptor in kernel space for the GDT * because soon the kernel won't have access anymore to the userspace * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt early_gdt_descr(%rip) + subq $16, %rsp + movw $(GDT_SIZE-1), (%rsp) + leaq gdt_page(%rdx), %rax + movq %rax, 2(%rsp) + lgdt (%rsp) + addq $16, %rsp /* set up data segments */ xorl %eax,%eax @@ -271,16 +286,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movl initial_gs(%rip),%eax - movl initial_gs+4(%rip),%edx +#ifndef CONFIG_SMP + leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx +#endif + movl %edx, %eax + shrq $32, %rdx wrmsr - /* - * Setup a boot time stack - Any secondary CPU will have lost its stack - * by now because the cr3-switch above unmaps the real-mode stack - */ - movq initial_stack(%rip), %rsp - /* Setup and Load IDT */ pushq %rsi call early_setup_idt @@ -372,7 +384,11 @@ SYM_CODE_END(secondary_startup_64) SYM_CODE_START(start_cpu0) ANNOTATE_NOENDBR UNWIND_HINT_EMPTY - movq initial_stack(%rip), %rsp + + /* Find the idle task stack */ + movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq TASK_threadsp(%rcx), %rsp + jmp .Ljump_to_C_code SYM_CODE_END(start_cpu0) #endif @@ -416,16 +432,9 @@ SYM_CODE_END(vc_boot_ghcb) __REFDATA .balign 8 SYM_DATA(initial_code, .quad x86_64_start_kernel) -SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) #ifdef CONFIG_AMD_MEM_ENCRYPT SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif - -/* - * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder - * reliably detect the end of the stack. - */ -SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE) __FINITDATA __INIT @@ -657,8 +666,7 @@ SYM_DATA_END(level1_fixmap_pgt) .data .align 16 -SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1) -SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page)) +SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9013bb28255a..851477f7d728 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -121,17 +121,20 @@ int arch_update_cpu_topology(void) return retval; } + +static unsigned int smpboot_warm_reset_vector_count; + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); + if (!smpboot_warm_reset_vector_count++) { + CMOS_WRITE(0xa, 0xf); + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; + } spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; } static inline void smpboot_restore_warm_reset_vector(void) @@ -143,10 +146,12 @@ static inline void smpboot_restore_warm_reset_vector(void) * to default values. */ spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0, 0xf); + if (!--smpboot_warm_reset_vector_count) { + CMOS_WRITE(0, 0xf); + *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; + } spin_unlock_irqrestore(&rtc_lock, flags); - *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } /* @@ -1059,8 +1064,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); -#else - initial_gs = per_cpu_offset(cpu); #endif return 0; } @@ -1086,9 +1089,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, start_ip = real_mode_header->trampoline_start64; #endif idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; - initial_stack = idle->thread.sp; + + if (IS_ENABLED(CONFIG_X86_32)) { + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); + initial_stack = idle->thread.sp; + } else { + smpboot_control = cpu; + } /* Enable the espfix hack for this CPU */ init_espfix_ap(cpu); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index e36ea4268bd2..91f7a53519a7 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen) ANNOTATE_NOENDBR cld - mov initial_stack(%rip), %rsp + leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp /* Set up %gs. * diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 2c6e99ca48af..d607f51404fc 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk) * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued - * @task: Pointer to target task + * @pid: Pointer to target task PID * @elist: List head for the expiry list * @firing: Timer is currently firing + * @handling: Pointer to the task which handles expiry */ struct cpu_timer { - struct timerqueue_node node; - struct timerqueue_head *head; - struct pid *pid; - struct list_head elist; - int firing; + struct timerqueue_node node; + struct timerqueue_head *head; + struct pid *pid; + struct list_head elist; + int firing; + struct task_struct __rcu *handling; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, @@ -135,10 +138,12 @@ struct posix_cputimers { /** * posix_cputimers_work - Container for task work based posix CPU timer expiry * @work: The task work to be scheduled + * @mutex: Mutex held around expiry in context of this task work * @scheduled: @work has been scheduled already, no further processing */ struct posix_cputimers_work { struct callback_head work; + struct mutex mutex; unsigned int scheduled; }; diff --git a/kernel/signal.c b/kernel/signal.c index 8cb28f1df294..8f6330f0e9ca 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) /* * Now find a thread we can wake up to take the signal off the queue. * - * If the main thread wants the signal, it gets first crack. - * Probably the least surprising to the average bear. + * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; @@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) ret = -1; rcu_read_lock(); + + /* + * This function is used by POSIX timers to deliver a timer signal. + * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID + * set), the signal must be delivered to the specific thread (queues + * into t->pending). + * + * Where type is not PIDTYPE_PID, signals must be delivered to the + * process. In this case, prefer to deliver to current if it is in + * the same thread group as the target process, which avoids + * unnecessarily waking up a potentially idle task. + */ t = pid_task(pid, type); - if (!t || !likely(lock_task_sighand(t, &flags))) + if (!t) + goto ret; + if (type != PIDTYPE_PID && same_thread_group(t, current)) + t = current; + if (!likely(lock_task_sighand(t, &flags))) goto ret; ret = 1; /* the signal is ignored */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2f5e9b34022c..e9c6f9d0e42c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head, return expires; ctmr->firing = 1; + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(ctmr->handling, current); cpu_timer_dequeue(ctmr); list_add_tail(&ctmr->elist, firing); } @@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk); #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK static void posix_cpu_timers_work(struct callback_head *work) { + struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work); + + mutex_lock(&cw->mutex); handle_posix_cpu_timers(current); + mutex_unlock(&cw->mutex); +} + +/* + * Invoked from the posix-timer core when a cancel operation failed because + * the timer is marked firing. The caller holds rcu_read_lock(), which + * protects the timer and the task which is expiring it from being freed. + */ +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling); + + /* Has the handling task completed expiry already? */ + if (!tsk) + return; + + /* Ensure that the task cannot go away */ + get_task_struct(tsk); + /* Now drop the RCU protection so the mutex can be locked */ + rcu_read_unlock(); + /* Wait on the expiry mutex */ + mutex_lock(&tsk->posix_cputimers_work.mutex); + /* Release it immediately again. */ + mutex_unlock(&tsk->posix_cputimers_work.mutex); + /* Drop the task reference. */ + put_task_struct(tsk); + /* Relock RCU so the callsite is balanced */ + rcu_read_lock(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + /* Ensure that timr->it.cpu.handling task cannot go away */ + rcu_read_lock(); + spin_unlock_irq(&timr->it_lock); + posix_cpu_timer_wait_running(timr); + rcu_read_unlock(); + /* @timr is on stack and is valid */ + spin_lock_irq(&timr->it_lock); } /* @@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p) sizeof(p->posix_cputimers_work.work)); init_task_work(&p->posix_cputimers_work.work, posix_cpu_timers_work); + mutex_init(&p->posix_cputimers_work.mutex); p->posix_cputimers_work.scheduled = false; } @@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk) lockdep_posixtimer_exit(); } +static void posix_cpu_timer_wait_running(struct k_itimer *timr) +{ + cpu_relax(); +} + +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr) +{ + spin_unlock_irq(&timr->it_lock); + cpu_relax(); + spin_lock_irq(&timr->it_lock); +} + static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) { return false; @@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk) */ if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); + /* See posix_cpu_timer_wait_running() */ + rcu_assign_pointer(timer->it.cpu.handling, NULL); spin_unlock(&timer->it_lock); } } @@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, expires = cpu_timer_getexpires(&timer.it.cpu); error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { - /* - * Timer is now unarmed, deletion can not fail. - */ + /* Timer is now unarmed, deletion can not fail. */ posix_cpu_timer_del(&timer); + } else { + while (error == TIMER_RETRY) { + posix_cpu_timer_wait_running_nsleep(&timer); + error = posix_cpu_timer_del(&timer); + } } - spin_unlock_irq(&timer.it_lock); - while (error == TIMER_RETRY) { - /* - * We need to handle case when timer was or is in the - * middle of firing. In other cases we already freed - * resources. - */ - spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_del(&timer); - spin_unlock_irq(&timer.it_lock); - } + spin_unlock_irq(&timer.it_lock); if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* @@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = { .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, .timer_rearm = posix_cpu_timer_rearm, + .timer_wait_running = posix_cpu_timer_wait_running, }; const struct k_clock clock_process = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0c8a87a11b39..808a247205a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, rcu_read_lock(); unlock_timer(timer, *flags); + /* + * kc->timer_wait_running() might drop RCU lock. So @timer + * cannot be touched anymore after the function returns! + */ if (!WARN_ON_ONCE(!kc->timer_wait_running)) kc->timer_wait_running(timer); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5c47cc50f9e9..2d716bfd4a66 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -220,9 +220,19 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + ktime_t next_p; + u32 rem; + tick_do_timer_cpu = cpu; - tick_next_period = ktime_get(); + next_p = ktime_get(); + div_u64_rem(next_p, TICK_NSEC, &rem); + if (rem) { + next_p -= rem; + next_p += TICK_NSEC; + } + + tick_next_period = next_p; #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6485f8707def..bdc64a74e71a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,43 +650,67 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog_sched(); } -/* - * Updates the per-CPU time idle statistics counters - */ -static void -update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; - if (ts->idle_active) { - delta = ktime_sub(now, ts->idle_entrytime); - if (nr_iowait_cpu(cpu) > 0) - ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); - else - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_entrytime = now; - } + if (WARN_ON_ONCE(!ts->idle_active)) + return; - if (last_update_time) - *last_update_time = ktime_to_us(now); + delta = ktime_sub(now, ts->idle_entrytime); -} + write_seqcount_begin(&ts->idle_sleeptime_seq); + if (nr_iowait_cpu(smp_processor_id()) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); -static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) -{ - update_ts_time_stats(smp_processor_id(), ts, now, NULL); + ts->idle_entrytime = now; ts->idle_active = 0; + write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { + write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); ts->idle_active = 1; + write_seqcount_end(&ts->idle_sleeptime_seq); + sched_clock_idle_sleep_event(); } +static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, + bool compute_delta, u64 *last_update_time) +{ + ktime_t now, idle; + unsigned int seq; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) + *last_update_time = ktime_to_us(now); + + do { + seq = read_seqcount_begin(&ts->idle_sleeptime_seq); + + if (ts->idle_active && compute_delta) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(*sleeptime, delta); + } else { + idle = *sleeptime; + } + } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); + + return ktime_to_us(idle); + +} + /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query @@ -694,7 +718,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts) * counters if NULL. * * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -704,27 +731,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts) u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, idle; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - idle = ts->idle_sleeptime; - } else { - if (ts->idle_active && !nr_iowait_cpu(cpu)) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - idle = ktime_add(ts->idle_sleeptime, delta); - } else { - idle = ts->idle_sleeptime; - } - } - - return ktime_to_us(idle); + return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, + !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); @@ -735,7 +744,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); * counters if NULL. * * Return the cumulative iowait time (since boot) for a given - * CPU, in microseconds. + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. @@ -745,26 +757,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t now, iowait; - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) { - update_ts_time_stats(cpu, ts, now, last_update_time); - iowait = ts->iowait_sleeptime; - } else { - if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - iowait = ktime_add(ts->iowait_sleeptime, delta); - } else { - iowait = ts->iowait_sleeptime; - } - } - - return ktime_to_us(iowait); + return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, + nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); @@ -1097,10 +1092,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) +/** + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + */ +void tick_nohz_idle_stop_tick(void) { - ktime_t expires; + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); + ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the @@ -1132,16 +1133,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) } } -/** - * tick_nohz_idle_stop_tick - stop the idle tick from the idle task - * - * When the next event is more than a tick into the future, stop the idle tick - */ -void tick_nohz_idle_stop_tick(void) -{ - __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); -} - void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 504649513399..5ed5a9d41d5a 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -22,65 +22,82 @@ enum tick_nohz_mode { /** * struct tick_sched - sched tick emulation and no idle tick control/stats - * @sched_timer: hrtimer to schedule the periodic tick in high - * resolution mode - * @check_clocks: Notification mechanism about clocksource changes - * @nohz_mode: Mode - one state of tick_nohz_mode + * * @inidle: Indicator that the CPU is in the tick idle mode * @tick_stopped: Indicator that the idle tick has been stopped * @idle_active: Indicator that the CPU is actively in the tick idle mode; * it is reset during irq handling phases. - * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @do_timer_last: CPU was the last one doing do_timer before going idle * @got_idle_tick: Tick timer function has run with @inidle set + * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @last_tick_jiffies: Value of jiffies seen on last tick + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_waketime: Time when the idle was interrupted + * @idle_entrytime: Time when the idle call was entered + * @nohz_mode: Mode - one state of tick_nohz_mode + * @last_jiffies: Base jiffies snapshot when next event was last computed + * @timer_expires_base: Base time clock monotonic for @timer_expires + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @idle_expires: Next tick in idle, for debugging purpose only * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped - * @idle_entrytime: Time when the idle call was entered - * @idle_waketime: Time when the idle was interrupted * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) - * @timer_expires_base: Base time clock monotonic for @timer_expires - * @next_timer: Expiry time of next expiring timer for debugging purpose only * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick - * @last_tick_jiffies: Value of jiffies seen on last tick - * @stalled_jiffies: Number of stalled jiffies detected across ticks + * @check_clocks: Notification mechanism about clocksource changes */ struct tick_sched { - struct hrtimer sched_timer; - unsigned long check_clocks; - enum tick_nohz_mode nohz_mode; - + /* Common flags */ unsigned int inidle : 1; unsigned int tick_stopped : 1; unsigned int idle_active : 1; unsigned int do_timer_last : 1; unsigned int got_idle_tick : 1; + /* Tick handling: jiffies stall check */ + unsigned int stalled_jiffies; + unsigned long last_tick_jiffies; + + /* Tick handling */ + struct hrtimer sched_timer; ktime_t last_tick; ktime_t next_tick; unsigned long idle_jiffies; + ktime_t idle_waketime; + + /* Idle entry */ + seqcount_t idle_sleeptime_seq; + ktime_t idle_entrytime; + + /* Tick stop */ + enum tick_nohz_mode nohz_mode; + unsigned long last_jiffies; + u64 timer_expires_base; + u64 timer_expires; + u64 next_timer; + ktime_t idle_expires; unsigned long idle_calls; unsigned long idle_sleeps; - ktime_t idle_entrytime; - ktime_t idle_waketime; + + /* Idle exit */ ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - unsigned long last_jiffies; - u64 timer_expires; - u64 timer_expires_base; - u64 next_timer; - ktime_t idle_expires; + + /* Full dynticks handling */ atomic_t tick_dep_mask; - unsigned long last_tick_jiffies; - unsigned int stalled_jiffies; + + /* Clocksource changes */ + unsigned long check_clocks; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index e814061d6aa0..9f031eafc465 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -5,18 +5,13 @@ GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) -# This cmd checks that the vdso library does not contain absolute relocation +# This cmd checks that the vdso library does not contain dynamic relocations. # It has to be called after the linking of the vdso library and requires it # as a parameter. # -# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds -# to the absolute relocation types printed by "objdump -R" and accepted by the -# dynamic linker. -ifndef ARCH_REL_TYPE_ABS -$(error ARCH_REL_TYPE_ABS is not set) -endif - +# As a workaround for some GNU ld ports which produce unneeded R_*_NONE +# dynamic relocations, ignore R_*_NONE. quiet_cmd_vdso_check = VDSOCHK $@ - cmd_vdso_check = if $(OBJDUMP) -R $@ | grep -E -h "$(ARCH_REL_TYPE_ABS)"; \ + cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \ then (echo >&2 "$@: dynamic relocations are not supported"; \ rm -f $@; /bin/false); fi diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c index 781f7a50fc3f..f335eec5067e 100644 --- a/tools/testing/selftests/proc/proc-uptime-001.c +++ b/tools/testing/selftests/proc/proc-uptime-001.c @@ -13,7 +13,9 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -// Test that values in /proc/uptime increment monotonically. +// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment +// monotonically. We don't test idle time monotonicity due to broken iowait +// task counting, cf: comment above get_cpu_idle_time_us() #undef NDEBUG #include #include @@ -25,20 +27,31 @@ int main(void) { - uint64_t start, u0, u1, i0, i1; + uint64_t start, u0, u1, c0, c1; int fd; fd = open("/proc/uptime", O_RDONLY); assert(fd >= 0); - proc_uptime(fd, &u0, &i0); + u0 = proc_uptime(fd); start = u0; + c0 = clock_boottime(); + do { - proc_uptime(fd, &u1, &i1); + u1 = proc_uptime(fd); + c1 = clock_boottime(); + + /* Is /proc/uptime monotonic ? */ assert(u1 >= u0); - assert(i1 >= i0); + + /* Is CLOCK_BOOTTIME monotonic ? */ + assert(c1 >= c0); + + /* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */ + assert(c0 >= u0); + u0 = u1; - i0 = i1; + c0 = c1; } while (u1 - start < 100); return 0; diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c index 7d0aa22bdc12..ae453daa96c1 100644 --- a/tools/testing/selftests/proc/proc-uptime-002.c +++ b/tools/testing/selftests/proc/proc-uptime-002.c @@ -13,8 +13,10 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -// Test that values in /proc/uptime increment monotonically -// while shifting across CPUs. +// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment +// monotonically while shifting across CPUs. We don't test idle time +// monotonicity due to broken iowait task counting, cf: comment above +// get_cpu_idle_time_us() #undef NDEBUG #include #include @@ -42,10 +44,10 @@ static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned lo int main(void) { + uint64_t u0, u1, c0, c1; unsigned int len; unsigned long *m; unsigned int cpu; - uint64_t u0, u1, i0, i1; int fd; /* find out "nr_cpu_ids" */ @@ -60,7 +62,9 @@ int main(void) fd = open("/proc/uptime", O_RDONLY); assert(fd >= 0); - proc_uptime(fd, &u0, &i0); + u0 = proc_uptime(fd); + c0 = clock_boottime(); + for (cpu = 0; cpu < len * 8; cpu++) { memset(m, 0, len); m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long))); @@ -68,11 +72,20 @@ int main(void) /* CPU might not exist, ignore error */ sys_sched_setaffinity(0, len, m); - proc_uptime(fd, &u1, &i1); + u1 = proc_uptime(fd); + c1 = clock_boottime(); + + /* Is /proc/uptime monotonic ? */ assert(u1 >= u0); - assert(i1 >= i0); + + /* Is CLOCK_BOOTTIME monotonic ? */ + assert(c1 >= c0); + + /* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */ + assert(c0 >= u0); + u0 = u1; - i0 = i1; + c0 = c1; } return 0; diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h index dc6a42b1d6b0..730cce4a3d73 100644 --- a/tools/testing/selftests/proc/proc-uptime.h +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -19,10 +19,22 @@ #include #include #include +#include #include "proc.h" -static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) +static uint64_t clock_boottime(void) +{ + struct timespec ts; + int err; + + err = clock_gettime(CLOCK_BOOTTIME, &ts); + assert(err >= 0); + + return (ts.tv_sec * 100) + (ts.tv_nsec / 10000000); +} + +static uint64_t proc_uptime(int fd) { uint64_t val1, val2; char buf[64], *p; @@ -43,18 +55,6 @@ static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) assert(p[3] == ' '); val2 = (p[1] - '0') * 10 + p[2] - '0'; - *uptime = val1 * 100 + val2; - p += 4; - - val1 = xstrtoull(p, &p); - assert(p[0] == '.'); - assert('0' <= p[1] && p[1] <= '9'); - assert('0' <= p[2] && p[2] <= '9'); - assert(p[3] == '\n'); - - val2 = (p[1] - '0') * 10 + p[2] - '0'; - *idle = val1 * 100 + val2; - - assert(p + 4 == buf + rv); + return val1 * 100 + val2; } diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 0ba500056e63..8a17c0e8d82b 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -188,6 +188,80 @@ static int check_timer_create(int which) return 0; } +int remain; +__thread int got_signal; + +static void *distribution_thread(void *arg) +{ + while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + return NULL; +} + +static void distribution_handler(int nr) +{ + if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED)) + __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED); +} + +/* + * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID + * timer signals. This primarily tests that the kernel does not favour any one. + */ +static int check_timer_distribution(void) +{ + int err, i; + timer_t id; + const int nthreads = 10; + pthread_t threads[nthreads]; + struct itimerspec val = { + .it_value.tv_sec = 0, + .it_value.tv_nsec = 1000 * 1000, + .it_interval.tv_sec = 0, + .it_interval.tv_nsec = 1000 * 1000, + }; + + printf("Check timer_create() per process signal distribution... "); + fflush(stdout); + + remain = nthreads + 1; /* worker threads + this thread */ + signal(SIGALRM, distribution_handler); + err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); + if (err < 0) { + perror("Can't create timer\n"); + return -1; + } + err = timer_settime(id, 0, &val, NULL); + if (err < 0) { + perror("Can't set timer\n"); + return -1; + } + + for (i = 0; i < nthreads; i++) { + if (pthread_create(&threads[i], NULL, distribution_thread, NULL)) { + perror("Can't create thread\n"); + return -1; + } + } + + /* Wait for all threads to receive the signal. */ + while (__atomic_load_n(&remain, __ATOMIC_RELAXED)); + + for (i = 0; i < nthreads; i++) { + if (pthread_join(threads[i], NULL)) { + perror("Can't join thread\n"); + return -1; + } + } + + if (timer_delete(id)) { + perror("Can't delete timer\n"); + return -1; + } + + printf("[OK]\n"); + return 0; +} + int main(int argc, char **argv) { printf("Testing posix timers. False negative may happen on CPU execution \n"); @@ -217,5 +291,8 @@ int main(int argc, char **argv) if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0) return ksft_exit_fail(); + if (check_timer_distribution() < 0) + return ksft_exit_fail(); + return ksft_exit_pass(); }