From bc13809f1c47245cd584f4ad31ad06a5c5f40e54 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Sat, 3 Oct 2020 08:03:56 +0200 Subject: [PATCH 001/326] efi/libstub/x86: simplify efi_is_native() CONFIG_EFI_MIXED depends on CONFIG_X86_64=y. There is no need to check CONFIG_X86_64 again. Signed-off-by: Heinrich Schuchardt Link: https://lore.kernel.org/r/20201003060356.4913-1-xypron.glpk@gmx.de Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bc9758ef292e..7673dc833232 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -213,8 +213,6 @@ static inline bool efi_is_64bit(void) static inline bool efi_is_native(void) { - if (!IS_ENABLED(CONFIG_X86_64)) - return true; return efi_is_64bit(); } From 688eb28211abdf82a3f51e8997f1c8137947227d Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Sun, 11 Oct 2020 10:20:12 -0400 Subject: [PATCH 002/326] efi/x86: Only copy the compressed kernel image in efi_relocate_kernel() The image_size argument to efi_relocate_kernel() is currently specified as init_size, but this is unnecessarily large. The compressed kernel is much smaller, in fact, its image only extends up to the start of _bss, since at this point, the .bss section is still uninitialized. Depending on compression level, this can reduce the amount of data copied by 4-5x. Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20201011142012.96493-1-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/x86-stub.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 3672539cb96e..f14c4ff5839f 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -715,8 +715,11 @@ unsigned long efi_main(efi_handle_t handle, (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) || (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) || (image_offset == 0)) { + extern char _bss[]; + status = efi_relocate_kernel(&bzimage_addr, - hdr->init_size, hdr->init_size, + (unsigned long)_bss - bzimage_addr, + hdr->init_size, hdr->pref_address, hdr->kernel_alignment, LOAD_PHYSICAL_ADDR); From 6edcf9dc2e1aff3aa1f5a69ee420fb30dd0e968a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Oct 2020 16:34:02 +0100 Subject: [PATCH 003/326] efi/libstub: EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER should not default to yes EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER is deprecated, so it should not be enabled by default. In light of commit 4da0b2b7e67524cc ("efi/libstub: Re-enable command line initrd loading for x86"), keep the default for X86. Fixes: cf6b83664895a5c7 ("efi/libstub: Make initrd file loader configurable") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20201028153402.1736103-1-geert+renesas@glider.be Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 36ec1f718893..b452cfa2100b 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -122,7 +122,7 @@ config EFI_ARMSTUB_DTB_LOADER config EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER bool "Enable the command line initrd loader" if !X86 depends on EFI_STUB && (EFI_GENERIC_STUB || X86) - default y + default y if X86 depends on !RISCV help Select this config option to add support for the initrd= command From e1ac4b2406d94eddce8ac2c5ab4235f6075a9602 Mon Sep 17 00:00:00 2001 From: Chester Lin Date: Fri, 30 Oct 2020 14:08:38 +0800 Subject: [PATCH 004/326] efi: generalize efi_get_secureboot Generalize the efi_get_secureboot() function so not only efistub but also other subsystems can use it. Note that the MokSbState handling is not factored out: the variable is boot time only, and so it cannot be parameterized as easily. Also, the IMA code will switch to this version in a future patch, and it does not incorporate the MokSbState exception in the first place. Note that the new efi_get_secureboot_mode() helper treats any failures to read SetupMode as setup mode being disabled. Co-developed-by: Chester Lin Signed-off-by: Chester Lin Acked-by: Mimi Zohar Signed-off-by: Ard Biesheuvel --- arch/x86/boot/compressed/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 ++ drivers/firmware/efi/libstub/secureboot.c | 41 +++++++++-------------- include/linux/efi.h | 23 ++++++++++++- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ee249088cbfe..8d358a6fe6ec 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2d7abcd99de9..b8ec29d6a74a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -848,4 +848,6 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, void efi_handle_post_ebs_state(void); +enum efi_secureboot_mode efi_get_secureboot(void); + #endif diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index 5efc524b14be..af18d86c1604 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -12,15 +12,16 @@ #include "efistub.h" -/* BIOS variables */ -static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; -static const efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; -static const efi_char16_t efi_SetupMode_name[] = L"SetupMode"; - /* SHIM variables */ static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; +static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, void *data) +{ + return get_efi_var(name, vendor, attr, data_size, data); +} + /* * Determine whether we're in secure boot mode. * @@ -30,26 +31,18 @@ static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; enum efi_secureboot_mode efi_get_secureboot(void) { u32 attr; - u8 secboot, setupmode, moksbstate; unsigned long size; + enum efi_secureboot_mode mode; efi_status_t status; + u8 moksbstate; - size = sizeof(secboot); - status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = get_efi_var(efi_SetupMode_name, &efi_variable_guid, - NULL, &size, &setupmode); - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(get_var); + if (mode == efi_secureboot_mode_unknown) { + efi_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* * See if a user has put the shim into insecure mode. If so, and if the @@ -69,8 +62,4 @@ enum efi_secureboot_mode efi_get_secureboot(void) secure_boot_enabled: efi_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - -out_efi_err: - efi_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } diff --git a/include/linux/efi.h b/include/linux/efi.h index d7c0e73af2b9..1cd5d91d8ca1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1089,7 +1089,28 @@ enum efi_secureboot_mode { efi_secureboot_mode_disabled, efi_secureboot_mode_enabled, }; -enum efi_secureboot_mode efi_get_secureboot(void); + +static inline +enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var) +{ + u8 secboot, setupmode = 0; + efi_status_t status; + unsigned long size; + + size = sizeof(secboot); + status = get_var(L"SecureBoot", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, + &secboot); + if (status == EFI_NOT_FOUND) + return efi_secureboot_mode_disabled; + if (status != EFI_SUCCESS) + return efi_secureboot_mode_unknown; + + size = sizeof(setupmode); + get_var(L"SetupMode", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, &setupmode); + if (secboot == 0 || setupmode == 1) + return efi_secureboot_mode_disabled; + return efi_secureboot_mode_enabled; +} #ifdef CONFIG_RESET_ATTACK_MITIGATION void efi_enable_reset_attack_mitigation(void); From 25519d68344269f9dc58b5bc72f648248a1fafb9 Mon Sep 17 00:00:00 2001 From: Chester Lin Date: Fri, 30 Oct 2020 14:08:39 +0800 Subject: [PATCH 005/326] ima: generalize x86/EFI arch glue for other EFI architectures Move the x86 IMA arch code into security/integrity/ima/ima_efi.c, so that we will be able to wire it up for arm64 in a future patch. Co-developed-by: Chester Lin Signed-off-by: Chester Lin Acked-by: Mimi Zohar Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 3 ++ arch/x86/kernel/Makefile | 2 - security/integrity/ima/Makefile | 4 ++ .../integrity/ima/ima_efi.c | 45 +++++-------------- 4 files changed, 19 insertions(+), 35 deletions(-) rename arch/x86/kernel/ima_arch.c => security/integrity/ima/ima_efi.c (60%) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 7673dc833232..c98f78330b09 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -380,4 +380,7 @@ static inline void efi_fake_memmap_early(void) } #endif +#define arch_ima_efi_boot_mode \ + ({ extern struct boot_params boot_params; boot_params.secure_boot; }) + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 68608bd892c0..5eeb808eb024 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -161,5 +161,3 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif - -obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o diff --git a/security/integrity/ima/Makefile b/security/integrity/ima/Makefile index 67dabca670e2..2499f2485c04 100644 --- a/security/integrity/ima/Makefile +++ b/security/integrity/ima/Makefile @@ -14,3 +14,7 @@ ima-$(CONFIG_HAVE_IMA_KEXEC) += ima_kexec.o ima-$(CONFIG_IMA_BLACKLIST_KEYRING) += ima_mok.o ima-$(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) += ima_asymmetric_keys.o ima-$(CONFIG_IMA_QUEUE_EARLY_BOOT_KEYS) += ima_queue_keys.o + +ifeq ($(CONFIG_EFI),y) +ima-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_efi.o +endif diff --git a/arch/x86/kernel/ima_arch.c b/security/integrity/ima/ima_efi.c similarity index 60% rename from arch/x86/kernel/ima_arch.c rename to security/integrity/ima/ima_efi.c index 7dfb1e808928..71786d01946f 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/security/integrity/ima/ima_efi.c @@ -5,50 +5,29 @@ #include #include #include +#include -extern struct boot_params boot_params; +#ifndef arch_ima_efi_boot_mode +#define arch_ima_efi_boot_mode efi_secureboot_mode_unset +#endif static enum efi_secureboot_mode get_sb_mode(void) { - efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; - efi_status_t status; - unsigned long size; - u8 secboot, setupmode; - - size = sizeof(secboot); + enum efi_secureboot_mode mode; if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { pr_info("ima: secureboot mode unknown, no efi\n"); return efi_secureboot_mode_unknown; } - /* Get variable contents into buffer */ - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) { + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_disabled) pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - if (status != EFI_SUCCESS) { + else if (mode == efi_secureboot_mode_unknown) pr_info("ima: secureboot mode unknown\n"); - return efi_secureboot_mode_unknown; - } - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ - setupmode = 0; - - if (secboot == 0 || setupmode == 1) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - pr_info("ima: secureboot mode enabled\n"); - return efi_secureboot_mode_enabled; + else + pr_info("ima: secureboot mode enabled\n"); + return mode; } bool arch_ima_get_secureboot(void) @@ -57,7 +36,7 @@ bool arch_ima_get_secureboot(void) static bool initialized; if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = boot_params.secure_boot; + sb_mode = arch_ima_efi_boot_mode; if (sb_mode == efi_secureboot_mode_unset) sb_mode = get_sb_mode(); From 2b076054e524a92e3a303de487dfb7cf85d8e149 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Nov 2020 12:39:12 +0100 Subject: [PATCH 006/326] remove boolinit.cocci 0/1 for booleans is perfectly valid C. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Julia Lawall --- scripts/coccinelle/misc/boolinit.cocci | 195 ------------------------- 1 file changed, 195 deletions(-) delete mode 100644 scripts/coccinelle/misc/boolinit.cocci diff --git a/scripts/coccinelle/misc/boolinit.cocci b/scripts/coccinelle/misc/boolinit.cocci deleted file mode 100644 index fed6126e2b9d..000000000000 --- a/scripts/coccinelle/misc/boolinit.cocci +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/// Bool initializations should use true and false. Bool tests don't need -/// comparisons. Based on contributions from Joe Perches, Rusty Russell -/// and Bruce W Allan. -/// -// Confidence: High -// Copyright: (C) 2012 Julia Lawall, INRIA/LIP6. -// Copyright: (C) 2012 Gilles Muller, INRIA/LiP6. -// URL: http://coccinelle.lip6.fr/ -// Options: --include-headers - -virtual patch -virtual context -virtual org -virtual report - -@boolok@ -symbol true,false; -@@ -( -true -| -false -) - -@depends on patch@ -bool t; -@@ - -( -- t == true -+ t -| -- true == t -+ t -| -- t != true -+ !t -| -- true != t -+ !t -| -- t == false -+ !t -| -- false == t -+ !t -| -- t != false -+ t -| -- false != t -+ t -) - -@depends on patch disable is_zero, isnt_zero@ -bool t; -@@ - -( -- t == 1 -+ t -| -- t != 1 -+ !t -| -- t == 0 -+ !t -| -- t != 0 -+ t -) - -@depends on patch && boolok@ -bool b; -@@ -( - b = -- 0 -+ false -| - b = -- 1 -+ true -) - -// --------------------------------------------------------------------- - -@r1 depends on !patch@ -bool t; -position p; -@@ - -( -* t@p == true -| -* true == t@p -| -* t@p != true -| -* true != t@p -| -* t@p == false -| -* false == t@p -| -* t@p != false -| -* false != t@p -) - -@r2 depends on !patch disable is_zero, isnt_zero@ -bool t; -position p; -@@ - -( -* t@p == 1 -| -* t@p != 1 -| -* t@p == 0 -| -* t@p != 0 -) - -@r3 depends on !patch && boolok@ -bool b; -position p1; -@@ -( -*b@p1 = 0 -| -*b@p1 = 1 -) - -@r4 depends on !patch@ -bool b; -position p2; -identifier i; -constant c != {0,1}; -@@ -( - b = i -| -*b@p2 = c -) - -@script:python depends on org@ -p << r1.p; -@@ - -cocci.print_main("WARNING: Comparison to bool",p) - -@script:python depends on org@ -p << r2.p; -@@ - -cocci.print_main("WARNING: Comparison of 0/1 to bool variable",p) - -@script:python depends on org@ -p1 << r3.p1; -@@ - -cocci.print_main("WARNING: Assignment of 0/1 to bool variable",p1) - -@script:python depends on org@ -p2 << r4.p2; -@@ - -cocci.print_main("ERROR: Assignment of non-0/1 constant to bool variable",p2) - -@script:python depends on report@ -p << r1.p; -@@ - -coccilib.report.print_report(p[0],"WARNING: Comparison to bool") - -@script:python depends on report@ -p << r2.p; -@@ - -coccilib.report.print_report(p[0],"WARNING: Comparison of 0/1 to bool variable") - -@script:python depends on report@ -p1 << r3.p1; -@@ - -coccilib.report.print_report(p1[0],"WARNING: Assignment of 0/1 to bool variable") - -@script:python depends on report@ -p2 << r4.p2; -@@ - -coccilib.report.print_report(p2[0],"ERROR: Assignment of non-0/1 constant to bool variable") From 853c1a789f5fe8e783586a5c2dcc2ad1b57ac20f Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 30 Oct 2020 16:25:23 -0700 Subject: [PATCH 007/326] platform/chrome: Don't treat RTC events as wakeup sources The EC sends an RTC host event when the RTC fires, but we don't need to treat that as a wakeup event here. The RTC class already properly handles activating and deactivating a wakeup source in rtc_update_irq() by calling pm_stay_awake() at the start of processing and pm_relax() once all expired RTC timers have been processed. This reduces one wakeup increment but not much else. I noticed this while debugging RTC wakeups and how they always incremented the wakeup count by two instead of one because this is duplicated. Signed-off-by: Stephen Boyd Signed-off-by: Enric Balletbo i Serra Cc: Guenter Roeck Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: Link: https://lore.kernel.org/r/20201030232523.2654478-1-swboyd@chromium.org --- drivers/platform/chrome/cros_ec_proto.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 0ecee8b8773d..7c92a6e22d75 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -742,12 +742,16 @@ int cros_ec_get_next_event(struct cros_ec_device *ec_dev, * Sensor events need to be parsed by the sensor sub-device. * Defer them, and don't report the wakeup here. */ - if (event_type == EC_MKBP_EVENT_SENSOR_FIFO) - *wake_event = false; - /* Masked host-events should not count as wake events. */ - else if (host_event && - !(host_event & ec_dev->host_event_wake_mask)) + if (event_type == EC_MKBP_EVENT_SENSOR_FIFO) { *wake_event = false; + } else if (host_event) { + /* rtc_update_irq() already handles wakeup events. */ + if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_RTC)) + *wake_event = false; + /* Masked host-events should not count as wake events. */ + if (!(host_event & ec_dev->host_event_wake_mask)) + *wake_event = false; + } } return ret; From 0498710be002b35bcb43895c4133a4c4bbfd837e Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:30 -0700 Subject: [PATCH 008/326] platform/chrome: cros_ec_typec: Relocate set_port_params_v*() functions Move the cros_typec_set_port_params_v0/v1() functions closer to the place where they are called, cros_typec_port_update(). While we are performing the relocation, also move cros_typec_get_mux_info() closer to its call-site. No functional changes are introduced by this commit. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-2-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 136 ++++++++++++------------ 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 31be31161350..49083e21317d 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -329,74 +329,6 @@ static int cros_typec_ec_command(struct cros_typec_data *typec, return ret; } -static void cros_typec_set_port_params_v0(struct cros_typec_data *typec, - int port_num, struct ec_response_usb_pd_control *resp) -{ - struct typec_port *port = typec->ports[port_num]->port; - enum typec_orientation polarity; - - if (!resp->enabled) - polarity = TYPEC_ORIENTATION_NONE; - else if (!resp->polarity) - polarity = TYPEC_ORIENTATION_NORMAL; - else - polarity = TYPEC_ORIENTATION_REVERSE; - - typec_set_pwr_role(port, resp->role ? TYPEC_SOURCE : TYPEC_SINK); - typec_set_orientation(port, polarity); -} - -static void cros_typec_set_port_params_v1(struct cros_typec_data *typec, - int port_num, struct ec_response_usb_pd_control_v1 *resp) -{ - struct typec_port *port = typec->ports[port_num]->port; - enum typec_orientation polarity; - bool pd_en; - int ret; - - if (!(resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED)) - polarity = TYPEC_ORIENTATION_NONE; - else if (!resp->polarity) - polarity = TYPEC_ORIENTATION_NORMAL; - else - polarity = TYPEC_ORIENTATION_REVERSE; - typec_set_orientation(port, polarity); - typec_set_data_role(port, resp->role & PD_CTRL_RESP_ROLE_DATA ? - TYPEC_HOST : TYPEC_DEVICE); - typec_set_pwr_role(port, resp->role & PD_CTRL_RESP_ROLE_POWER ? - TYPEC_SOURCE : TYPEC_SINK); - typec_set_vconn_role(port, resp->role & PD_CTRL_RESP_ROLE_VCONN ? - TYPEC_SOURCE : TYPEC_SINK); - - /* Register/remove partners when a connect/disconnect occurs. */ - if (resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED) { - if (typec->ports[port_num]->partner) - return; - - pd_en = resp->enabled & PD_CTRL_RESP_ENABLED_PD_CAPABLE; - ret = cros_typec_add_partner(typec, port_num, pd_en); - if (ret) - dev_warn(typec->dev, - "Failed to register partner on port: %d\n", - port_num); - } else { - if (!typec->ports[port_num]->partner) - return; - cros_typec_remove_partner(typec, port_num); - } -} - -static int cros_typec_get_mux_info(struct cros_typec_data *typec, int port_num, - struct ec_response_usb_pd_mux_info *resp) -{ - struct ec_params_usb_pd_mux_info req = { - .port = port_num, - }; - - return cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_MUX_INFO, &req, - sizeof(req), resp, sizeof(*resp)); -} - static int cros_typec_usb_safe_state(struct cros_typec_port *port) { port->state.mode = TYPEC_STATE_SAFE; @@ -573,6 +505,74 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, return ret; } +static void cros_typec_set_port_params_v0(struct cros_typec_data *typec, + int port_num, struct ec_response_usb_pd_control *resp) +{ + struct typec_port *port = typec->ports[port_num]->port; + enum typec_orientation polarity; + + if (!resp->enabled) + polarity = TYPEC_ORIENTATION_NONE; + else if (!resp->polarity) + polarity = TYPEC_ORIENTATION_NORMAL; + else + polarity = TYPEC_ORIENTATION_REVERSE; + + typec_set_pwr_role(port, resp->role ? TYPEC_SOURCE : TYPEC_SINK); + typec_set_orientation(port, polarity); +} + +static void cros_typec_set_port_params_v1(struct cros_typec_data *typec, + int port_num, struct ec_response_usb_pd_control_v1 *resp) +{ + struct typec_port *port = typec->ports[port_num]->port; + enum typec_orientation polarity; + bool pd_en; + int ret; + + if (!(resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED)) + polarity = TYPEC_ORIENTATION_NONE; + else if (!resp->polarity) + polarity = TYPEC_ORIENTATION_NORMAL; + else + polarity = TYPEC_ORIENTATION_REVERSE; + typec_set_orientation(port, polarity); + typec_set_data_role(port, resp->role & PD_CTRL_RESP_ROLE_DATA ? + TYPEC_HOST : TYPEC_DEVICE); + typec_set_pwr_role(port, resp->role & PD_CTRL_RESP_ROLE_POWER ? + TYPEC_SOURCE : TYPEC_SINK); + typec_set_vconn_role(port, resp->role & PD_CTRL_RESP_ROLE_VCONN ? + TYPEC_SOURCE : TYPEC_SINK); + + /* Register/remove partners when a connect/disconnect occurs. */ + if (resp->enabled & PD_CTRL_RESP_ENABLED_CONNECTED) { + if (typec->ports[port_num]->partner) + return; + + pd_en = resp->enabled & PD_CTRL_RESP_ENABLED_PD_CAPABLE; + ret = cros_typec_add_partner(typec, port_num, pd_en); + if (ret) + dev_warn(typec->dev, + "Failed to register partner on port: %d\n", + port_num); + } else { + if (!typec->ports[port_num]->partner) + return; + cros_typec_remove_partner(typec, port_num); + } +} + +static int cros_typec_get_mux_info(struct cros_typec_data *typec, int port_num, + struct ec_response_usb_pd_mux_info *resp) +{ + struct ec_params_usb_pd_mux_info req = { + .port = port_num, + }; + + return cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_MUX_INFO, &req, + sizeof(req), resp, sizeof(*resp)); +} + static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) { struct ec_params_usb_pd_control req; From 7ab5a673f4ce65875c76e9812d2e6da063b87fb7 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:32 -0700 Subject: [PATCH 009/326] platform/chrome: cros_ec_typec: Fix remove partner logic The cros_unregister_ports() function can be called in situations where the partner has not been registered yet, and so its related data structures would not have been initialized. Calling cros_typec_remove_partner() in such a situation can lead to null pointer dereferences. So, only call cros_typec_remove_partner() if there is a valid registered partner pointer. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-3-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 49083e21317d..2665d8125910 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -190,7 +190,10 @@ static void cros_unregister_ports(struct cros_typec_data *typec) for (i = 0; i < typec->num_ports; i++) { if (!typec->ports[i]) continue; - cros_typec_remove_partner(typec, i); + + if (typec->ports[i]->partner) + cros_typec_remove_partner(typec, i); + usb_role_switch_put(typec->ports[i]->role_sw); typec_switch_put(typec->ports[i]->ori_sw); typec_mux_put(typec->ports[i]->mux); From 514acf1cefd020eb21d7c180050a8d66b723d2d8 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:34 -0700 Subject: [PATCH 010/326] platform/chrome: cros_ec_typec: Clear partner identity on device removal The partner identity struct isn't reset when a partner is removed, meaning a subsequent partner can inherit an old partner's identity VDOs before discovery is complete. So, clear that struct when a partner removal is detected. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-4-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 2665d8125910..faef56bcb9c5 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -181,6 +181,7 @@ static void cros_typec_remove_partner(struct cros_typec_data *typec, typec_unregister_partner(port->partner); port->partner = NULL; + memset(&port->p_identity, 0, sizeof(port->p_identity)); } static void cros_unregister_ports(struct cros_typec_data *typec) From cd2c40ff90b0e385c18f881ab5e17f7137864223 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:36 -0700 Subject: [PATCH 011/326] platform/chrome: cros_ec: Import Type C host commands Import the EC_CMD_TYPEC_STATUS and EC_CMD_TYPEC_DISCOVERY Chrome OS EC host commands from the EC code base [1]. These commands can be used by the application processor to query Power Delivery (PD) discovery information concerning connected Type C peripherals. Also add the EC_FEATURE_TYPEC_CMD feature flag, which is used to determine whether these commands are supported by the EC. [1]: https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/master/include/ec_commands.h Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-5-pmalani@chromium.org --- .../linux/platform_data/cros_ec_commands.h | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 1fcfe9e63cb9..7f54fdcdd8cb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1284,6 +1284,8 @@ enum ec_feature_code { EC_FEATURE_SCP = 39, /* The MCU is an Integrated Sensor Hub */ EC_FEATURE_ISH = 40, + /* New TCPMv2 TYPEC_ prefaced commands supported */ + EC_FEATURE_TYPEC_CMD = 41, }; #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32) @@ -5528,6 +5530,159 @@ struct ec_response_regulator_get_voltage { uint32_t voltage_mv; } __ec_align4; +/* + * Gather all discovery information for the given port and partner type. + * + * Note that if discovery has not yet completed, only the currently completed + * responses will be filled in. If the discovery data structures are changed + * in the process of the command running, BUSY will be returned. + * + * VDO field sizes are set to the maximum possible number of VDOs a VDM may + * contain, while the number of SVIDs here is selected to fit within the PROTO2 + * maximum parameter size. + */ +#define EC_CMD_TYPEC_DISCOVERY 0x0131 + +enum typec_partner_type { + TYPEC_PARTNER_SOP = 0, + TYPEC_PARTNER_SOP_PRIME = 1, +}; + +struct ec_params_typec_discovery { + uint8_t port; + uint8_t partner_type; /* enum typec_partner_type */ +} __ec_align1; + +struct svid_mode_info { + uint16_t svid; + uint16_t mode_count; /* Number of modes partner sent */ + uint32_t mode_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ +}; + +struct ec_response_typec_discovery { + uint8_t identity_count; /* Number of identity VDOs partner sent */ + uint8_t svid_count; /* Number of SVIDs partner sent */ + uint16_t reserved; + uint32_t discovery_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ + struct svid_mode_info svids[0]; +} __ec_align1; + +/* + * Gather all status information for a port. + * + * Note: this covers many of the return fields from the deprecated + * EC_CMD_USB_PD_CONTROL command, except those that are redundant with the + * discovery data. The "enum pd_cc_states" is defined with the deprecated + * EC_CMD_USB_PD_CONTROL command. + * + * This also combines in the EC_CMD_USB_PD_MUX_INFO flags. + */ +#define EC_CMD_TYPEC_STATUS 0x0133 + +/* + * Power role. + * + * Note this is also used for PD header creation, and values align to those in + * the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.4 Port Power Role). + */ +enum pd_power_role { + PD_ROLE_SINK = 0, + PD_ROLE_SOURCE = 1 +}; + +/* + * Data role. + * + * Note this is also used for PD header creation, and the first two values + * align to those in the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.6 Port Data Role). + */ +enum pd_data_role { + PD_ROLE_UFP = 0, + PD_ROLE_DFP = 1, + PD_ROLE_DISCONNECTED = 2, +}; + +enum pd_vconn_role { + PD_ROLE_VCONN_OFF = 0, + PD_ROLE_VCONN_SRC = 1, +}; + +/* + * Note: BIT(0) may be used to determine whether the polarity is CC1 or CC2, + * regardless of whether a debug accessory is connected. + */ +enum tcpc_cc_polarity { + /* + * _CCx: is used to indicate the polarity while not connected to + * a Debug Accessory. Only one CC line will assert a resistor and + * the other will be open. + */ + POLARITY_CC1 = 0, + POLARITY_CC2 = 1, + + /* + * _CCx_DTS is used to indicate the polarity while connected to a + * SRC Debug Accessory. Assert resistors on both lines. + */ + POLARITY_CC1_DTS = 2, + POLARITY_CC2_DTS = 3, + + /* + * The current TCPC code relies on these specific POLARITY values. + * Adding in a check to verify if the list grows for any reason + * that this will give a hint that other places need to be + * adjusted. + */ + POLARITY_COUNT +}; + +#define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) +#define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) + +struct ec_params_typec_status { + uint8_t port; +} __ec_align1; + +struct ec_response_typec_status { + uint8_t pd_enabled; /* PD communication enabled - bool */ + uint8_t dev_connected; /* Device connected - bool */ + uint8_t sop_connected; /* Device is SOP PD capable - bool */ + uint8_t source_cap_count; /* Number of Source Cap PDOs */ + + uint8_t power_role; /* enum pd_power_role */ + uint8_t data_role; /* enum pd_data_role */ + uint8_t vconn_role; /* enum pd_vconn_role */ + uint8_t sink_cap_count; /* Number of Sink Cap PDOs */ + + uint8_t polarity; /* enum tcpc_cc_polarity */ + uint8_t cc_state; /* enum pd_cc_states */ + uint8_t dp_pin; /* DP pin mode (MODE_DP_IN_[A-E]) */ + uint8_t mux_state; /* USB_PD_MUX* - encoded mux state */ + + char tc_state[32]; /* TC state name */ + + uint32_t events; /* PD_STATUS_EVENT bitmask */ + + /* + * BCD PD revisions for partners + * + * The format has the PD major reversion in the upper nibble, and PD + * minor version in the next nibble. Following two nibbles are + * currently 0. + * ex. PD 3.2 would map to 0x3200 + * + * PD major/minor will be 0 if no PD device is connected. + */ + uint16_t sop_revision; + uint16_t sop_prime_revision; + + uint32_t source_cap_pdos[7]; /* Max 7 PDOs can be present */ + + uint32_t sink_cap_pdos[7]; /* Max 7 PDOs can be present */ +} __ec_align1; + /*****************************************************************************/ /* The command range 0x200-0x2FF is reserved for Rotor. */ From 80f8cef60d79f23c02e546ba3de2fce84d5e8bdb Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:38 -0700 Subject: [PATCH 012/326] platform/chrome: cros_ec_typec: Introduce TYPEC_STATUS Make a call to the newly introduced EC_CMD_TYPEC_STATUS command. Currently we just check to see if the SOP (port-partner) discovery was done and emit a debug level print for it. Subsequent patches will retrieve and parse the discovery data and fill out the Type C connector class data structures. Also check the EC_FEATURE_TYPEC_CMD feature flag at probe, and only call the new TYPEC_STATUS command if the feature flag is supported. Reported-by: kernel test robot Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-6-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index faef56bcb9c5..f578d0bfbe5a 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -48,6 +48,9 @@ struct cros_typec_port { /* Port alt modes. */ struct typec_altmode p_altmode[CROS_EC_ALTMODE_MAX]; + + /* Flag indicating that PD discovery data parsing is completed. */ + bool disc_done; }; /* Platform-specific data for the Chrome OS EC Type C controller. */ @@ -60,6 +63,7 @@ struct cros_typec_data { struct cros_typec_port *ports[EC_USB_PD_MAX_PORTS]; struct notifier_block nb; struct work_struct port_work; + bool typec_cmd_supported; }; static int cros_typec_parse_port_props(struct typec_capability *cap, @@ -182,6 +186,7 @@ static void cros_typec_remove_partner(struct cros_typec_data *typec, typec_unregister_partner(port->partner); port->partner = NULL; memset(&port->p_identity, 0, sizeof(port->p_identity)); + port->disc_done = false; } static void cros_unregister_ports(struct cros_typec_data *typec) @@ -577,6 +582,31 @@ static int cros_typec_get_mux_info(struct cros_typec_data *typec, int port_num, sizeof(req), resp, sizeof(*resp)); } +static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num) +{ + struct ec_response_typec_status resp; + struct ec_params_typec_status req = { + .port = port_num, + }; + int ret; + + ret = cros_typec_ec_command(typec, 0, EC_CMD_TYPEC_STATUS, &req, sizeof(req), + &resp, sizeof(resp)); + if (ret < 0) { + dev_warn(typec->dev, "EC_CMD_TYPEC_STATUS failed for port: %d\n", port_num); + return; + } + + if (typec->ports[port_num]->disc_done) + return; + + /* Handle any events appropriately. */ + if (resp.events & PD_STATUS_EVENT_SOP_DISC_DONE) { + dev_dbg(typec->dev, "SOP Discovery done for port: %d\n", port_num); + typec->ports[port_num]->disc_done = true; + } +} + static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) { struct ec_params_usb_pd_control req; @@ -613,6 +643,9 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) cros_typec_set_port_params_v0(typec, port_num, (struct ec_response_usb_pd_control *) &resp); + if (typec->typec_cmd_supported) + cros_typec_handle_status(typec, port_num); + /* Update the switches if they exist, according to requested state */ ret = cros_typec_get_mux_info(typec, port_num, &mux_resp); if (ret < 0) { @@ -661,6 +694,23 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec) return 0; } +/* Check the EC feature flags to see if TYPEC_* commands are supported. */ +static int cros_typec_cmds_supported(struct cros_typec_data *typec) +{ + struct ec_response_get_features resp = {}; + int ret; + + ret = cros_typec_ec_command(typec, 0, EC_CMD_GET_FEATURES, NULL, 0, + &resp, sizeof(resp)); + if (ret < 0) { + dev_warn(typec->dev, + "Failed to get features, assuming typec commands unsupported.\n"); + return 0; + } + + return resp.flags[EC_FEATURE_TYPEC_CMD / 32] & EC_FEATURE_MASK_1(EC_FEATURE_TYPEC_CMD); +} + static void cros_typec_port_work(struct work_struct *work) { struct cros_typec_data *typec = container_of(work, struct cros_typec_data, port_work); @@ -720,6 +770,8 @@ static int cros_typec_probe(struct platform_device *pdev) return ret; } + typec->typec_cmd_supported = !!cros_typec_cmds_supported(typec); + ret = cros_typec_ec_command(typec, 0, EC_CMD_USB_PD_PORTS, NULL, 0, &resp, sizeof(resp)); if (ret < 0) From f6f668118918f533676e51f3214f5a104562b59c Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:40 -0700 Subject: [PATCH 013/326] platform/chrome: cros_ec_typec: Parse partner PD ID VDOs Use EC_CMD_TYPE_DISCOVERY to retrieve and store the discovery data for the port partner. With that data, update the PD Identity VDO values for the partner, which were earlier not initialized. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Acked-by: Heikki Krogerus Cc: Heikki Krogerus Link: https://lore.kernel.org/r/20201029222738.482366-7-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 60 ++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index f578d0bfbe5a..f14550dac614 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct cros_typec_port { /* Flag indicating that PD discovery data parsing is completed. */ bool disc_done; + struct ec_response_typec_discovery *sop_disc; }; /* Platform-specific data for the Chrome OS EC Type C controller. */ @@ -298,6 +300,12 @@ static int cros_typec_init_ports(struct cros_typec_data *typec) port_num); cros_typec_register_port_altmodes(typec, port_num); + + cros_port->sop_disc = devm_kzalloc(dev, EC_PROTO2_MAX_RESPONSE_SIZE, GFP_KERNEL); + if (!cros_port->sop_disc) { + ret = -ENOMEM; + goto unregister_ports; + } } return 0; @@ -582,6 +590,51 @@ static int cros_typec_get_mux_info(struct cros_typec_data *typec, int port_num, sizeof(req), resp, sizeof(*resp)); } +static int cros_typec_handle_sop_disc(struct cros_typec_data *typec, int port_num) +{ + struct cros_typec_port *port = typec->ports[port_num]; + struct ec_response_typec_discovery *sop_disc = port->sop_disc; + struct ec_params_typec_discovery req = { + .port = port_num, + .partner_type = TYPEC_PARTNER_SOP, + }; + int ret = 0; + int i; + + if (!port->partner) { + dev_err(typec->dev, + "SOP Discovery received without partner registered, port: %d\n", + port_num); + ret = -EINVAL; + goto disc_exit; + } + + memset(sop_disc, 0, EC_PROTO2_MAX_RESPONSE_SIZE); + ret = cros_typec_ec_command(typec, 0, EC_CMD_TYPEC_DISCOVERY, &req, sizeof(req), + sop_disc, EC_PROTO2_MAX_RESPONSE_SIZE); + if (ret < 0) { + dev_err(typec->dev, "Failed to get SOP discovery data for port: %d\n", port_num); + goto disc_exit; + } + + /* First, update the PD identity VDOs for the partner. */ + if (sop_disc->identity_count > 0) + port->p_identity.id_header = sop_disc->discovery_vdo[0]; + if (sop_disc->identity_count > 1) + port->p_identity.cert_stat = sop_disc->discovery_vdo[1]; + if (sop_disc->identity_count > 2) + port->p_identity.product = sop_disc->discovery_vdo[2]; + + /* Copy the remaining identity VDOs till a maximum of 6. */ + for (i = 3; i < sop_disc->identity_count && i < VDO_MAX_OBJECTS; i++) + port->p_identity.vdo[i - 3] = sop_disc->discovery_vdo[i]; + + ret = typec_partner_set_identity(port->partner); + +disc_exit: + return ret; +} + static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num) { struct ec_response_typec_status resp; @@ -602,7 +655,12 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num /* Handle any events appropriately. */ if (resp.events & PD_STATUS_EVENT_SOP_DISC_DONE) { - dev_dbg(typec->dev, "SOP Discovery done for port: %d\n", port_num); + ret = cros_typec_handle_sop_disc(typec, port_num); + if (ret < 0) { + dev_err(typec->dev, "Couldn't parse SOP Disc data, port: %d\n", port_num); + return; + } + typec->ports[port_num]->disc_done = true; } } From de0f49487db3667f5204dcec6d3482c9bd1a0a30 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:42 -0700 Subject: [PATCH 014/326] platform/chrome: cros_ec_typec: Register partner altmodes Use the discovery data from the Chrome EC to register parter altmodes with the Type C Connector Class framework. Also introduce a node struct to keep track of the list of registered alt modes. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Acked-by: Heikki Krogerus Cc: Heikki Krogerus Link: https://lore.kernel.org/r/20201029222738.482366-8-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 77 +++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index f14550dac614..ce031a10eb1b 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -31,6 +32,12 @@ enum { CROS_EC_ALTMODE_MAX, }; +/* Container for altmode pointer nodes. */ +struct cros_typec_altmode_node { + struct typec_altmode *amode; + struct list_head list; +}; + /* Per port data. */ struct cros_typec_port { struct typec_port *port; @@ -53,6 +60,7 @@ struct cros_typec_port { /* Flag indicating that PD discovery data parsing is completed. */ bool disc_done; struct ec_response_typec_discovery *sop_disc; + struct list_head partner_mode_list; }; /* Platform-specific data for the Chrome OS EC Type C controller. */ @@ -172,11 +180,25 @@ static int cros_typec_add_partner(struct cros_typec_data *typec, int port_num, return ret; } +static void cros_typec_unregister_altmodes(struct cros_typec_data *typec, int port_num) +{ + struct cros_typec_port *port = typec->ports[port_num]; + struct cros_typec_altmode_node *node, *tmp; + + list_for_each_entry_safe(node, tmp, &port->partner_mode_list, list) { + list_del(&node->list); + typec_unregister_altmode(node->amode); + devm_kfree(typec->dev, node); + } +} + static void cros_typec_remove_partner(struct cros_typec_data *typec, int port_num) { struct cros_typec_port *port = typec->ports[port_num]; + cros_typec_unregister_altmodes(typec, port_num); + port->state.alt = NULL; port->state.mode = TYPEC_STATE_USB; port->state.data = NULL; @@ -306,6 +328,8 @@ static int cros_typec_init_ports(struct cros_typec_data *typec) ret = -ENOMEM; goto unregister_ports; } + + INIT_LIST_HEAD(&cros_port->partner_mode_list); } return 0; @@ -590,6 +614,49 @@ static int cros_typec_get_mux_info(struct cros_typec_data *typec, int port_num, sizeof(req), resp, sizeof(*resp)); } +static int cros_typec_register_altmodes(struct cros_typec_data *typec, int port_num) +{ + struct cros_typec_port *port = typec->ports[port_num]; + struct ec_response_typec_discovery *sop_disc = port->sop_disc; + struct cros_typec_altmode_node *node; + struct typec_altmode_desc desc; + struct typec_altmode *amode; + int ret = 0; + int i, j; + + for (i = 0; i < sop_disc->svid_count; i++) { + for (j = 0; j < sop_disc->svids[i].mode_count; j++) { + memset(&desc, 0, sizeof(desc)); + desc.svid = sop_disc->svids[i].svid; + desc.mode = j; + desc.vdo = sop_disc->svids[i].mode_vdo[j]; + + amode = typec_partner_register_altmode(port->partner, &desc); + if (IS_ERR(amode)) { + ret = PTR_ERR(amode); + goto err_cleanup; + } + + /* If no memory is available we should unregister and exit. */ + node = devm_kzalloc(typec->dev, sizeof(*node), GFP_KERNEL); + if (!node) { + ret = -ENOMEM; + typec_unregister_altmode(amode); + goto err_cleanup; + } + + node->amode = amode; + list_add_tail(&node->list, &port->partner_mode_list); + } + } + + return 0; + +err_cleanup: + cros_typec_unregister_altmodes(typec, port_num); + return ret; +} + static int cros_typec_handle_sop_disc(struct cros_typec_data *typec, int port_num) { struct cros_typec_port *port = typec->ports[port_num]; @@ -630,6 +697,16 @@ static int cros_typec_handle_sop_disc(struct cros_typec_data *typec, int port_nu port->p_identity.vdo[i - 3] = sop_disc->discovery_vdo[i]; ret = typec_partner_set_identity(port->partner); + if (ret < 0) { + dev_err(typec->dev, "Failed to update partner PD identity, port: %d\n", port_num); + goto disc_exit; + } + + ret = cros_typec_register_altmodes(typec, port_num); + if (ret < 0) { + dev_err(typec->dev, "Failed to register partner altmodes, port: %d\n", port_num); + goto disc_exit; + } disc_exit: return ret; From 8d39cee0592e0129280e5a3cc480d64649c5e63f Mon Sep 17 00:00:00 2001 From: Chester Lin Date: Fri, 30 Oct 2020 14:08:40 +0800 Subject: [PATCH 015/326] arm64/ima: add ima_arch support Add arm64 IMA arch support. The code and arch policy is mainly inherited from x86. Co-developed-by: Chester Lin Signed-off-by: Chester Lin Acked-by: Mimi Zohar Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f858c352f72a..04e78a367c2c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1849,6 +1849,7 @@ config EFI select EFI_RUNTIME_WRAPPERS select EFI_STUB select EFI_GENERIC_STUB + imply IMA_SECURE_AND_OR_TRUSTED_BOOT default y help This option provides support for runtime services provided From b283477d394ac41ca59ee20eb9293ae9002eb1d7 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 3 Nov 2020 07:50:04 +0100 Subject: [PATCH 016/326] efi: x86/xen: switch to efi_get_secureboot_mode helper Now that we have a static inline helper to discover the platform's secure boot mode that can be shared between the EFI stub and the kernel proper, switch to it, and drop some comments about keeping them in sync manually. Signed-off-by: Ard Biesheuvel --- arch/x86/xen/efi.c | 37 ++++++----------------- drivers/firmware/efi/libstub/secureboot.c | 3 -- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 205a9bc981b0..7d7ffb9c826a 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -93,37 +93,22 @@ static efi_system_table_t __init *xen_efi_probe(void) /* * Determine whether we're in secure boot mode. - * - * Please keep the logic in sync with - * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot(). */ static enum efi_secureboot_mode xen_efi_get_secureboot(void) { - static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; + enum efi_secureboot_mode mode; efi_status_t status; - u8 moksbstate, secboot, setupmode; + u8 moksbstate; unsigned long size; - size = sizeof(secboot); - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_unknown) { + pr_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* See if a user has put the shim into insecure mode. */ size = sizeof(moksbstate); @@ -140,10 +125,6 @@ static enum efi_secureboot_mode xen_efi_get_secureboot(void) secure_boot_enabled: pr_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - - out_efi_err: - pr_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } void __init xen_efi_init(struct boot_params *boot_params) diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index af18d86c1604..8a18930f3eb6 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -24,9 +24,6 @@ static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, /* * Determine whether we're in secure boot mode. - * - * Please keep the logic in sync with - * arch/x86/xen/efi.c:xen_efi_get_secureboot(). */ enum efi_secureboot_mode efi_get_secureboot(void) { From 1a57b1a3e11086a4f183b245754b213b1d9b2d40 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 18 Nov 2020 15:35:17 +0800 Subject: [PATCH 017/326] ACPI/nfit: avoid accessing uninitialized memory in acpi_nfit_ctl() The ACPI_ALLOCATE() does not zero the "buf", so when the condition "integer->type != ACPI_TYPE_INTEGER" in int_to_buf() is met, the result is unpredictable in acpi_nfit_ctl(). Signed-off-by: Zhen Lei Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/20201118073517.1884-1-thunder.leizhen@huawei.com Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 442608220b5c..cda7b6c52504 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -282,18 +282,19 @@ err: static union acpi_object *int_to_buf(union acpi_object *integer) { - union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4); + union acpi_object *buf = NULL; void *dst = NULL; - if (!buf) - goto err; - if (integer->type != ACPI_TYPE_INTEGER) { WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n", integer->type); goto err; } + buf = ACPI_ALLOCATE(sizeof(*buf) + 4); + if (!buf) + goto err; + dst = buf + 1; buf->type = ACPI_TYPE_BUFFER; buf->buffer.length = 4; From 2dd2a1740ee19cd2636d247276cf27bfa434b0e2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 20 Nov 2020 08:50:07 -0800 Subject: [PATCH 018/326] libnvdimm/namespace: Fix reaping of invalidated block-window-namespace labels A recent change to ndctl to attempt to reconfigure namespaces in place uncovered a label accounting problem in block-window-type namespaces. The ndctl "create.sh" test is able to trigger this signature: WARNING: CPU: 34 PID: 9167 at drivers/nvdimm/label.c:1100 __blk_label_update+0x9a3/0xbc0 [libnvdimm] [..] RIP: 0010:__blk_label_update+0x9a3/0xbc0 [libnvdimm] [..] Call Trace: uuid_store+0x21b/0x2f0 [libnvdimm] kernfs_fop_write+0xcf/0x1c0 vfs_write+0xcc/0x380 ksys_write+0x68/0xe0 When allocated capacity for a namespace is renamed (new UUID) the labels with the old UUID need to be deleted. The ndctl behavior to always destroy namespaces on reconfiguration hid this problem. The immediate impact of this bug is limited since block-window-type namespaces only seem to exist in the specification and not in any shipping products. However, the label handling code is being reused for other technologies like CXL region labels, so there is a benefit to making sure both vertical labels sets (block-window) and horizontal label sets (pmem) have a functional reference implementation in libnvdimm. Fixes: c4703ce11c23 ("libnvdimm/namespace: Fix label tracking error") Cc: Cc: Vishal Verma Cc: Dave Jiang Cc: Ira Weiny Signed-off-by: Dan Williams --- drivers/nvdimm/label.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 47a4828b8b31..6f2be7a34598 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -980,6 +980,15 @@ static int __blk_label_update(struct nd_region *nd_region, } } + /* release slots associated with any invalidated UUIDs */ + mutex_lock(&nd_mapping->lock); + list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) + if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)) { + reap_victim(nd_mapping, label_ent); + list_move(&label_ent->list, &list); + } + mutex_unlock(&nd_mapping->lock); + /* * Find the resource associated with the first label in the set * per the v1.2 namespace specification. From 9a7e3d7f056831a6193d6d737fb7a26dfdceb04b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 Nov 2020 17:43:53 -0800 Subject: [PATCH 019/326] ACPI: NFIT: Fix input validation of bus-family Dan reports that smatch thinks userspace can craft an out-of-bound bus family number. However, nd_cmd_clear_to_send() blocks all non-zero values of bus-family since only the kernel can initiate these commands. However, in the speculation path, family is a user controlled array index value so mask it for speculation safety. Also, since the nd_cmd_clear_to_send() safety is non-obvious and possibly may change in the future include input validation as if userspace could get past the nd_cmd_clear_to_send() gatekeeper. Link: http://lore.kernel.org/r/20201111113000.GA1237157@mwanda Reported-by: Dan Carpenter Fixes: 6450ddbd5d8e ("ACPI: NFIT: Define runtime firmware activation commands") Cc: Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index cda7b6c52504..b11b08a60684 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -479,8 +480,11 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, cmd_mask = nd_desc->cmd_mask; if (cmd == ND_CMD_CALL && call_pkg->nd_family) { family = call_pkg->nd_family; - if (!test_bit(family, &nd_desc->bus_family_mask)) + if (family > NVDIMM_BUS_FAMILY_MAX || + !test_bit(family, &nd_desc->bus_family_mask)) return -EINVAL; + family = array_index_nospec(family, + NVDIMM_BUS_FAMILY_MAX + 1); dsm_mask = acpi_desc->family_dsm_mask[family]; guid = to_nfit_bus_uuid(family); } else { From 50a4952fd67b7f7f551e82ac07c51c1a7a74d474 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 15 Oct 2020 15:24:52 +0200 Subject: [PATCH 020/326] Updated locking documentation for transaction_t We used LockDoc to derive locking rules for each member of struct transaction_t. Based on those results, we extended the existing documentation by more members of struct transaction_t, and updated the existing documentation. Link: https://lore.kernel.org/r/10cfbef1-994c-c604-f8a6-b1042fcc622f@tu-dortmund.de Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 578ff196b3ce..d2a4860feb72 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -538,6 +538,7 @@ struct transaction_chp_stats_s { * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. + * (Locking Documentation improved by LockDoc) */ /* @@ -658,12 +659,12 @@ struct transaction_s unsigned long t_start; /* - * When commit was requested + * When commit was requested [j_state_lock] */ unsigned long t_requested; /* - * Checkpointing stats [j_checkpoint_sem] + * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; From 7b721e6d334c9699fcd94553a7fe073ec717d926 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Oct 2020 19:41:09 +0800 Subject: [PATCH 021/326] ext4: remove redundant operation that set bh to NULL The out_fail branch path don't release the bh and the second bh is valid only in the for statement, so we don't need to set them to NULL. Signed-off-by: Kaixu Xia Reviewed-by: zhangyi (F) Link: https://lore.kernel.org/r/1603194069-17557-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94472044f4c1..40e7aebaf3d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4074,7 +4074,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); ret = PTR_ERR(bh); - bh = NULL; goto out_fail; } /* @@ -4703,7 +4702,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "can't read group descriptor %d", i); db_count = i; ret = PTR_ERR(bh); - bh = NULL; goto failed_mount2; } rcu_read_lock(); From 46bac5352929f75c1ec0c2395b06dcbc0fbaee69 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Tue, 20 Oct 2020 16:22:01 +0800 Subject: [PATCH 022/326] ext4: remove the null check of bio_vec page bv_page can't be NULL in a valid bio_vec, so we can remove the NULL check, as we did in other places when calling bio_for_each_segment_all() to go through all bio_vec of a bio. Reviewed-by: Jan Kara Signed-off-by: Xianting Tian Link: https://lore.kernel.org/r/20201020082201.34257-1-tian.xianting@h3c.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index defd2e10dfd1..cb135a94482d 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio) unsigned under_io = 0; unsigned long flags; - if (!page) - continue; - if (fscrypt_is_bounce_page(page)) { bounce_page = page; page = fscrypt_pagecache_page(bounce_page); From face525ecb30b3d7e35be21911a933980ab504f9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Oct 2020 14:23:26 +0100 Subject: [PATCH 023/326] ext4: remove redundant assignment of variable ex Variable ex is assigned a variable that is not being read, the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20201021132326.148052-1-colin.king@canonical.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 17d7096b3212..90ba74c14694 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5988,7 +5988,6 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) kfree(path); break; } - ex = path2[path2->p_depth].p_ext; for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) From f177ee0882af031b3d7a1e66e1639a58c7932dee Mon Sep 17 00:00:00 2001 From: Roman Anufriev Date: Thu, 22 Oct 2020 06:20:59 +0300 Subject: [PATCH 024/326] ext4: add helpers for checking whether quota can be enabled/is journalled Right now, there are several places, where we check whether fs is capable of enabling quota or if quota is journalled with quite long and non-self-descriptive condition statements. This patch wraps these statements into helpers for better readability and easier usage. Link: https://lore.kernel.org/r/1603336860-16153-1-git-send-email-dotdot@yandex-team.ru Reviewed-by: Jan Kara Signed-off-by: Roman Anufriev Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 15 +++++++++++++++ fs/ext4/ext4_jbd2.h | 9 +++------ fs/ext4/super.c | 5 +---- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 65ecaf96d0a4..fb4b36585ff6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3361,6 +3361,21 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } +#ifdef CONFIG_QUOTA +static inline bool ext4_quota_capable(struct super_block *sb) +{ + return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); +} + +static inline bool ext4_is_quota_journalled(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + return (ext4_has_feature_quota(sb) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); +} +#endif + /* * Block validity checking */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 00dc668e052b..a124c68b0c75 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -86,17 +86,14 @@ #ifdef CONFIG_QUOTA /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ -#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - ext4_has_feature_quota(sb)) ? 1 : 0) +#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ -#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - ext4_has_feature_quota(sb)) ?\ +#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) -#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - ext4_has_feature_quota(sb)) ?\ +#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 40e7aebaf3d0..d9e1946a634e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6210,11 +6210,8 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - /* Are we journaling quotas? */ - if (ext4_has_feature_quota(sb) || - sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { + if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { From ca9b404ff137d67e559c5bd77533408bb0fa41dc Mon Sep 17 00:00:00 2001 From: Roman Anufriev Date: Thu, 22 Oct 2020 06:21:00 +0300 Subject: [PATCH 025/326] ext4: print quota journalling mode on (re-)mount Right now, it is hard to understand which quota journalling type is enabled: you need to be quite familiar with kernel code and trace it or really understand what different combinations of fs flags/mount options lead to. This patch adds printing of current quota jounalling mode on each mount/remount, thus making it easier to check it at a glance/in autotests. The semantics is similar to ext4 data journalling modes: * journalled - quota configured, journalling will be enabled * writeback - quota configured, journalling won't be enabled * none - quota isn't configured * disabled - kernel compiled without CONFIG_QUOTA feature Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1603336860-16153-2-git-send-email-dotdot@yandex-team.ru Signed-off-by: Roman Anufriev Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d9e1946a634e..132adb2a2c01 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4005,6 +4005,21 @@ static void ext4_set_resv_clusters(struct super_block *sb) atomic64_set(&sbi->s_resv_clusters, resv_clusters); } +static const char *ext4_quota_mode(struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + if (!ext4_quota_capable(sb)) + return "none"; + + if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) + return "journalled"; + else + return "writeback"; +#else + return "disabled"; +#endif +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); @@ -5090,10 +5105,11 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %.*s%s%s", descr, + "Opts: %.*s%s%s. Quota mode: %s.", descr, (int) sizeof(sbi->s_es->s_mount_opts), sbi->s_es->s_mount_opts, - *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + *sbi->s_es->s_mount_opts ? "; " : "", orig_data, + ext4_quota_mode(sb)); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -6031,7 +6047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) */ *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); - ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. Quota mode: %s.", + orig_data, ext4_quota_mode(sb)); kfree(orig_data); return 0; From 837c23fbc1b812f814c75388b6b364349c02efd8 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:11 +0800 Subject: [PATCH 026/326] ext4: use ASSERT() to replace J_ASSERT() There are currently multiple forms of assertion, such as J_ASSERT(). J_ASEERT() is provided for the jbd module, which is a public module. Maybe we should use custom ASSERT() like other file systems, such as xfs, which would be better. Signed-off-by: Chunguang Xu Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604764698-4269-1-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/fsync.c | 2 +- fs/ext4/indirect.c | 4 ++-- fs/ext4/inode.c | 10 +++++----- fs/ext4/namei.c | 12 ++++-------- fs/ext4/super.c | 2 +- 7 files changed, 24 insertions(+), 18 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1d640b145637..f45f9feebe59 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; - J_ASSERT_BH(bh, buffer_locked(bh)); + ASSERT(buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb4b36585ff6..cd95965be3d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -98,6 +98,16 @@ #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +#define ASSERT(assert) \ +do { \ + if (unlikely(!(assert))) { \ + printk(KERN_EMERG \ + "Assertion failure in %s() at %s:%d: '%s'\n", \ + __func__, __FILE__, __LINE__, #assert); \ + BUG(); \ + } \ +} while (0) + /* data type for block offset of block group */ typedef int ext4_grpblk_t; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a42ca95840f2..113bfb023a4a 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -136,7 +136,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (unlikely(ext4_forced_shutdown(sbi))) return -EIO; - J_ASSERT(ext4_journal_current_handle() == NULL); + ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 05efa682bc2f..1223a18c3ff9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -534,8 +534,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); - J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); - J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, &blocks_to_boundary); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0d8385aea898..b147c2e20469 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -830,8 +830,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - || handle != NULL || create == 0); + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; @@ -846,9 +846,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { - J_ASSERT(create != 0); - J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - || (handle != NULL)); + ASSERT(create != 0); + ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + || (handle != NULL)); /* * Now that we do not always journal data, we should diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 33509266f5a0..7a890ff214f1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -182,10 +182,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, return bh; } -#ifndef assert -#define assert(test) J_ASSERT(test) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -849,7 +845,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, break; } } - assert (at == p - 1); + ASSERT(at == p - 1); } at = p - 1; @@ -1265,8 +1261,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) struct dx_entry *old = frame->at, *new = old + 1; int count = dx_get_count(entries); - assert(count < dx_get_limit(entries)); - assert(old < entries + count); + ASSERT(count < dx_get_limit(entries)); + ASSERT(old < entries + count); memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); dx_set_hash(new, hash); dx_set_block(new, block); @@ -2961,7 +2957,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * hold i_mutex, or the inode can not be referenced from outside, * so i_nlink should not be bumped due to race */ - J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 132adb2a2c01..f86220a8df50 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1240,7 +1240,7 @@ static void ext4_put_super(struct super_block *sb) * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); - J_ASSERT(list_empty(&sbi->s_orphan)); + ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); From 6bd97bf273bdb4944904e57480f6545bca48ad77 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:12 +0800 Subject: [PATCH 027/326] ext4: remove redundant mb_regenerate_buddy() After this patch (163a203), if an abnormal bitmap is detected, we will mark the group as corrupt, and we will not use this group in the future. Therefore, it should be meaningless to regenerate the buddy bitmap of this group, It might be better to delete it. Signed-off-by: Chunguang Xu Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604764698-4269-2-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 24af9ed5c3e5..5b745551e7a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -822,24 +822,6 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&sbi->s_bal_lock); } -static void mb_regenerate_buddy(struct ext4_buddy *e4b) -{ - int count; - int order = 1; - void *buddy; - - while ((buddy = mb_find_buddy(e4b, order++, &count))) { - ext4_set_bits(buddy, 0, count); - } - e4b->bd_info->bb_fragments = 0; - memset(e4b->bd_info->bb_counters, 0, - sizeof(*e4b->bd_info->bb_counters) * - (e4b->bd_sb->s_blocksize_bits + 2)); - - ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, - e4b->bd_bitmap, e4b->bd_group); -} - /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1512,7 +1494,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } - mb_regenerate_buddy(e4b); goto done; } From ce3cca337401123c9cf96896fc4e1657bb016bd3 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:13 +0800 Subject: [PATCH 028/326] ext4: simplify the code of mb_find_order_for_block The code of mb_find_order_for_block is a bit obscure, but we can simplify it with mb_find_buddy(), make the code more concise. Signed-off-by: Chunguang Xu Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604764698-4269-3-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b745551e7a9..29dfeb043050 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1289,22 +1289,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { - int order = 1; - int bb_incr = 1 << (e4b->bd_blkbits - 1); + int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { - block = block >> 1; - if (!mb_test_bit(block, bb)) { + bb = mb_find_buddy(e4b, order, &max); + if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } - bb += bb_incr; - bb_incr >>= 1; order++; } return 0; From 91c1c092f27da4164d55ca81e0a483108f8a3235 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Dec 2020 17:33:33 +0100 Subject: [PATCH 029/326] efi: capsule: use atomic kmap for transient sglist mappings Don't use the heavy-weight kmap() API to create short-lived mappings of the scatter-gather list entries that are released as soon as the entries are written. Instead, use kmap_atomic(), which is more suited to this purpose. Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/capsule.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c index 598b7800d14e..43f6fe7bfe80 100644 --- a/drivers/firmware/efi/capsule.c +++ b/drivers/firmware/efi/capsule.c @@ -244,7 +244,7 @@ int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages) for (i = 0; i < sg_count; i++) { efi_capsule_block_desc_t *sglist; - sglist = kmap(sg_pages[i]); + sglist = kmap_atomic(sg_pages[i]); for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) { u64 sz = min_t(u64, imagesize, @@ -265,7 +265,7 @@ int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages) else sglist[j].data = page_to_phys(sg_pages[i + 1]); - kunmap(sg_pages[i]); + kunmap_atomic(sglist); } mutex_lock(&capsule_mutex); From 4dbe44fb538c59a4adae5abfa9ded2f310250315 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 7 Dec 2020 18:40:53 +0100 Subject: [PATCH 030/326] efi: capsule: clean scatter-gather entries from the D-cache Scatter-gather lists passed to UpdateCapsule() should be cleaned from the D-cache to ensure that they are visible to the CPU after a warm reboot before the MMU is enabled. On ARM and arm64 systems, this implies a D-cache clean by virtual address to the point of coherency. However, due to the fact that the firmware itself is not able to map physical addresses back to virtual addresses when running under the OS, this must be done by the caller. Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/efi.h | 5 +++++ arch/arm64/include/asm/efi.h | 5 +++++ drivers/firmware/efi/capsule.c | 12 ++++++++++++ 3 files changed, 22 insertions(+) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 3ee4f4381985..e9a06e164e06 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -93,4 +93,9 @@ struct efi_arm_entry_state { u32 sctlr_after_ebs; }; +static inline void efi_capsule_flush_cache_range(void *addr, int size) +{ + __cpuc_flush_dcache_area(addr, size); +} + #endif /* _ASM_ARM_EFI_H */ diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 973b14415271..00bd1e179d36 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -141,4 +141,9 @@ static inline void efi_set_pgd(struct mm_struct *mm) void efi_virtmap_load(void); void efi_virtmap_unload(void); +static inline void efi_capsule_flush_cache_range(void *addr, int size) +{ + __flush_dcache_area(addr, size); +} + #endif /* _ASM_EFI_H */ diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c index 43f6fe7bfe80..768430293669 100644 --- a/drivers/firmware/efi/capsule.c +++ b/drivers/firmware/efi/capsule.c @@ -12,6 +12,7 @@ #include #include #include +#include #include typedef struct { @@ -265,6 +266,17 @@ int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages) else sglist[j].data = page_to_phys(sg_pages[i + 1]); +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + /* + * At runtime, the firmware has no way to find out where the + * sglist elements are mapped, if they are mapped in the first + * place. Therefore, on architectures that can only perform + * cache maintenance by virtual address, the firmware is unable + * to perform this maintenance, and so it is up to the OS to do + * it instead. + */ + efi_capsule_flush_cache_range(sglist, PAGE_SIZE); +#endif kunmap_atomic(sglist); } From c0249238feefbbb99d517d06ace4338393901b67 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 12 Nov 2020 15:42:27 +0100 Subject: [PATCH 031/326] efi: arm: reduce minimum alignment of uncompressed kernel Now that we reduced the minimum relative alignment between PHYS_OFFSET and PAGE_OFFSET to 2 MiB, we can take this into account when allocating memory for the decompressed kernel when booting via EFI. This minimizes the amount of unusable memory we may end up with due to the base of DRAM being occupied by firmware. Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/efi.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index e9a06e164e06..153680541aea 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -66,13 +66,12 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) #define MAX_UNCOMP_KERNEL_SIZE SZ_32M /* - * phys-to-virt patching requires that the physical to virtual offset fits - * into the immediate field of an add/sub instruction, which comes down to the - * 24 least significant bits being zero, and so the offset should be a multiple - * of 16 MB. Since PAGE_OFFSET itself is a multiple of 16 MB, the physical - * base should be aligned to 16 MB as well. + * phys-to-virt patching requires that the physical to virtual offset is a + * multiple of 2 MiB. However, using an alignment smaller than TEXT_OFFSET + * here throws off the memory allocation logic, so let's use the lowest power + * of two greater than 2 MiB and greater than TEXT_OFFSET. */ -#define EFI_PHYS_ALIGN SZ_16M +#define EFI_PHYS_ALIGN max(SZ_2M, roundup_pow_of_two(TEXT_OFFSET)) /* on ARM, the FDT should be located in a lowmem region */ static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) @@ -83,7 +82,7 @@ static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) /* on ARM, the initrd should be loaded in a lowmem region */ static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) { - return round_down(image_addr, EFI_PHYS_ALIGN) + SZ_512M; + return round_down(image_addr, SZ_4M) + SZ_512M; } struct efi_arm_entry_state { From ff20661bb54cd57a18207b33cc57eb8d5c758a86 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Fri, 27 Nov 2020 20:20:51 +0100 Subject: [PATCH 032/326] efi/efi_test: read RuntimeServicesSupported Since the UEFI 2.8A specification the UEFI enabled firmware provides a configuration table EFI_RT_PROPERTIES_TABLE which indicates which runtime services are enabled. The EFI stub reads this table and saves the value of the field RuntimeServicesSupported internally. The Firmware Test Suite requires the value to determine if UEFI runtime services are correctly implemented. With this patch an IOCTL call is provided to read the value of the field RuntimeServicesSupported, e.g. #define EFI_RUNTIME_GET_SUPPORTED_MASK \ _IOR('p', 0x0C, unsigned int) unsigned int mask; fd = open("/dev/efi_test", O_RDWR); ret = ioctl(fd, EFI_RUNTIME_GET_SUPPORTED_MASK, &mask); Signed-off-by: Heinrich Schuchardt Link: https://lore.kernel.org/r/20201127192051.1430-1-xypron.glpk@gmx.de Acked-by: Colin Ian King Acked-by: Ivan Hu Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/test/efi_test.c | 16 ++++++++++++++++ drivers/firmware/efi/test/efi_test.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/drivers/firmware/efi/test/efi_test.c b/drivers/firmware/efi/test/efi_test.c index ddf9eae396fe..47d67bb0a516 100644 --- a/drivers/firmware/efi/test/efi_test.c +++ b/drivers/firmware/efi/test/efi_test.c @@ -663,6 +663,19 @@ out: return rv; } +static long efi_runtime_get_supported_mask(unsigned long arg) +{ + unsigned int __user *supported_mask; + int rv = 0; + + supported_mask = (unsigned int *)arg; + + if (put_user(efi.runtime_supported_mask, supported_mask)) + rv = -EFAULT; + + return rv; +} + static long efi_test_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -699,6 +712,9 @@ static long efi_test_ioctl(struct file *file, unsigned int cmd, case EFI_RUNTIME_RESET_SYSTEM: return efi_runtime_reset_system(arg); + + case EFI_RUNTIME_GET_SUPPORTED_MASK: + return efi_runtime_get_supported_mask(arg); } return -ENOTTY; diff --git a/drivers/firmware/efi/test/efi_test.h b/drivers/firmware/efi/test/efi_test.h index f2446aa1c2e3..117349e57993 100644 --- a/drivers/firmware/efi/test/efi_test.h +++ b/drivers/firmware/efi/test/efi_test.h @@ -118,4 +118,7 @@ struct efi_resetsystem { #define EFI_RUNTIME_RESET_SYSTEM \ _IOW('p', 0x0B, struct efi_resetsystem) +#define EFI_RUNTIME_GET_SUPPORTED_MASK \ + _IOR('p', 0x0C, unsigned int) + #endif /* _DRIVERS_FIRMWARE_EFI_TEST_H_ */ From 54649911f31b6e7c2a79a1426ca98259139e4c35 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 29 Oct 2020 14:49:01 +0100 Subject: [PATCH 033/326] efi: stub: get rid of efi_get_max_fdt_addr() Now that ARM started following the example of arm64 and RISC-V, and no longer imposes any restrictions on the placement of the FDT in memory at boot, we no longer need per-arch implementations of efi_get_max_fdt_addr() to factor out the differences. So get rid of it. Signed-off-by: Ard Biesheuvel Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20201029134901.9773-1-ardb@kernel.org --- arch/arm/include/asm/efi.h | 6 ------ arch/arm64/include/asm/efi.h | 6 ------ arch/riscv/include/asm/efi.h | 6 ------ drivers/firmware/efi/libstub/efi-stub.c | 1 - drivers/firmware/efi/libstub/efistub.h | 1 - drivers/firmware/efi/libstub/fdt.c | 3 +-- 6 files changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 153680541aea..abae071a02e1 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -73,12 +73,6 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) */ #define EFI_PHYS_ALIGN max(SZ_2M, roundup_pow_of_two(TEXT_OFFSET)) -/* on ARM, the FDT should be located in a lowmem region */ -static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) -{ - return round_down(image_addr, EFI_PHYS_ALIGN) + SZ_512M; -} - /* on ARM, the initrd should be loaded in a lowmem region */ static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) { diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index 00bd1e179d36..3578aba9c608 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -64,12 +64,6 @@ efi_status_t __efi_rt_asm_wrapper(void *, const char *, ...); #define EFI_KIMG_ALIGN \ (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) -/* on arm64, the FDT may be located anywhere in system RAM */ -static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) -{ - return ULONG_MAX; -} - /* * On arm64, we have to ensure that the initrd ends up in the linear region, * which is a 1 GB aligned region of size '1UL << (VA_BITS_MIN - 1)' that is diff --git a/arch/riscv/include/asm/efi.h b/arch/riscv/include/asm/efi.h index 7542282f1141..6d98cd999680 100644 --- a/arch/riscv/include/asm/efi.h +++ b/arch/riscv/include/asm/efi.h @@ -27,12 +27,6 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define ARCH_EFI_IRQ_FLAGS_MASK (SR_IE | SR_SPIE) -/* on RISC-V, the FDT may be located anywhere in system RAM */ -static inline unsigned long efi_get_max_fdt_addr(unsigned long image_addr) -{ - return ULONG_MAX; -} - /* Load initrd at enough distance from DRAM start */ static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) { diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 914a343c7785..ec2f3985bef3 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -273,7 +273,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, install_memreserve_table(); status = allocate_new_fdt_and_exit_boot(handle, &fdt_addr, - efi_get_max_fdt_addr(image_addr), initrd_addr, initrd_size, cmdline_ptr, fdt_addr, fdt_size); if (status != EFI_SUCCESS) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index b8ec29d6a74a..b50a6c67d9bd 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -750,7 +750,6 @@ efi_status_t efi_exit_boot_services(void *handle, efi_status_t allocate_new_fdt_and_exit_boot(void *handle, unsigned long *new_fdt_addr, - unsigned long max_addr, u64 initrd_addr, u64 initrd_size, char *cmdline_ptr, unsigned long fdt_addr, diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 368cd60000ee..365c3a43a198 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -238,7 +238,6 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, efi_status_t allocate_new_fdt_and_exit_boot(void *handle, unsigned long *new_fdt_addr, - unsigned long max_addr, u64 initrd_addr, u64 initrd_size, char *cmdline_ptr, unsigned long fdt_addr, @@ -275,7 +274,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, efi_info("Exiting boot services and installing virtual address map...\n"); map.map = &memory_map; - status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, max_addr); + status = efi_allocate_pages(MAX_FDT_SIZE, new_fdt_addr, ULONG_MAX); if (status != EFI_SUCCESS) { efi_err("Unable to allocate memory for new device tree.\n"); goto fail; From 8041ac642a1b31a2479488bc93fb16045726de23 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:15 +0800 Subject: [PATCH 034/326] ext4: update ext4_data_block_valid related comments Since ext4_data_block_valid() has been renamed to ext4_inode_block_valid(), the related comments need to be updated. Signed-off-by: Chunguang Xu Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604764698-4269-5-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/block_validity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8e6ca23ed172..c13422a770e4 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -206,7 +206,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu) * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be - * racing with ext4_data_block_valid() calls reading system_blks rbtree + * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. That's why we first build the rbtree and then * swap it in place. */ @@ -258,7 +258,7 @@ int ext4_setup_system_zone(struct super_block *sb) /* * System blks rbtree complete, announce it once to prevent racing - * with ext4_data_block_valid() accessing the rbtree at the same + * with ext4_inode_block_valid() accessing the rbtree at the same * time. */ rcu_assign_pointer(sbi->s_system_blks, system_blks); @@ -278,7 +278,7 @@ err: * * The update of system_blks pointer in this function is protected by * sb->s_umount semaphore. However we have to be careful as we can be - * racing with ext4_data_block_valid() calls reading system_blks rbtree + * racing with ext4_inode_block_valid() calls reading system_blks rbtree * protected only by RCU. So we first clear the system_blks pointer and * then free the rbtree only after RCU grace period expires. */ From 41fca96e635be523c28b8d57f2d1b1e51d1221d8 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:17 +0800 Subject: [PATCH 035/326] ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set() Signed-off-by: Chunguang Xu Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1604764698-4269-7-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6127e94ea4f5..4e3b1f8c2e81 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1927,7 +1927,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } else { /* Allocate a buffer where we construct the new block. */ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); - /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) goto cleanup; From 6ae9b5ffcaeba64c290dfb8bd7b0194b1fdf0c92 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 5 Nov 2020 18:03:05 -0800 Subject: [PATCH 036/326] platform/chrome: cros_ec_typec: Tolerate unrecognized mux flags On occasion, the Chrome Embedded Controller (EC) can send a mux configuration which doesn't map to a particular data mode. For instance, dedicated Type C chargers, when connected, may cause only USB_PD_MUX_POLARITY_INVERTED to be set. This is a valid flag combination and should not lead to a driver abort. Modify the mux configuration handling to not return an error when an unrecognized mux flag combination is encountered. Concordantly, make the ensuing print a debug level print so as to not pollute the kernel logs. Cc: Keith Short Signed-off-by: Prashant Malani Acked-by: Heikki Krogerus Signed-off-by: Benson Leung Link: https://lore.kernel.org/r/20201106020305.767202-1-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index ce031a10eb1b..5b8db02ab84a 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -537,10 +537,9 @@ static int cros_typec_configure_mux(struct cros_typec_data *typec, int port_num, port->state.mode = TYPEC_STATE_USB; ret = typec_mux_set(port->mux, &port->state); } else { - dev_info(typec->dev, - "Unsupported mode requested, mux flags: %x\n", - mux_flags); - ret = -ENOTSUPP; + dev_dbg(typec->dev, + "Unrecognized mode requested, mux flags: %x\n", + mux_flags); } return ret; From edf7ddbf1c5eb98b720b063b73e20e8a4a1ce673 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 31 Oct 2020 21:40:21 -0700 Subject: [PATCH 037/326] fs/namespace.c: WARN if mnt_count has become negative Missing calls to mntget() (or equivalently, too many calls to mntput()) are hard to detect because mntput() delays freeing mounts using task_work_add(), then again using call_rcu(). As a result, mnt_count can often be decremented to -1 without getting a KASAN use-after-free report. Such cases are still bugs though, and they point to real use-after-frees being possible. For an example of this, see the bug fixed by commit 1b0b9cc8d379 ("vfs: fsmount: add missing mntget()"), discussed at https://lkml.kernel.org/linux-fsdevel/20190605135401.GB30925@xxxxxxxxxxxxxxxxxxxxxxxxx/T/#u. This bug *should* have been trivial to find. But actually, it wasn't found until syzkaller happened to use fchdir() to manipulate the reference count just right for the bug to be noticeable. Address this by making mntput_no_expire() issue a WARN if mnt_count has become negative. Suggested-by: Miklos Szeredi Signed-off-by: Eric Biggers Signed-off-by: Al Viro --- fs/namespace.c | 9 ++++++--- fs/pnode.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cebaa3e81794..93006abe7946 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n) /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct mount *mnt) +int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP - unsigned int count = 0; + int count = 0; int cpu; for_each_possible_cpu(cpu) { @@ -1139,6 +1139,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); + int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { @@ -1162,7 +1163,9 @@ static void mntput_no_expire(struct mount *mnt) */ smp_mb(); mnt_add_count(mnt, -1); - if (mnt_get_count(mnt)) { + count = mnt_get_count(mnt); + if (count != 0) { + WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; diff --git a/fs/pnode.h b/fs/pnode.h index 49a058c73e4c..26f74e092bd9 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); -unsigned int mnt_get_count(struct mount *mnt); +int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, From 88149082bb8ef31b289673669e080ec6a00c2e59 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Tue, 8 Dec 2020 10:08:43 +0800 Subject: [PATCH 038/326] fs: Handle I_DONTCACHE in iput_final() instead of generic_drop_inode() If generic_drop_inode() returns true, it means iput_final() can evict this inode regardless of whether it is dirty or not. If we check I_DONTCACHE in generic_drop_inode(), any inode with this bit set will be evicted unconditionally. This is not the desired behavior because I_DONTCACHE only means the inode shouldn't be cached on the LRU list. As for whether we need to evict this inode, this is what generic_drop_inode() should do. This patch corrects the usage of I_DONTCACHE. This patch was proposed in [1]. [1]: https://lore.kernel.org/linux-fsdevel/20200831003407.GE12096@dread.disaster.area/ Fixes: dae2f8ed7992 ("fs: Lift XFS_IDONTCACHE to the VFS layer") Signed-off-by: Hao Li Reviewed-by: Dave Chinner Reviewed-by: Ira Weiny Signed-off-by: Al Viro --- fs/inode.c | 4 +++- include/linux/fs.h | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..5eea9912a0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1627,7 +1627,9 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & SB_ACTIVE)) { + if (!drop && + !(inode->i_state & I_DONTCACHE) && + (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..8bde32cf9711 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,8 +2878,7 @@ extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode) || - (inode->i_state & I_DONTCACHE); + return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); From 77573fa310d95e4293efdec98dace74cd9e52f43 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Tue, 8 Dec 2020 10:10:50 +0800 Subject: [PATCH 039/326] fs: Kill DCACHE_DONTCACHE dentry even if DCACHE_REFERENCED is set If DCACHE_REFERENCED is set, fast_dput() will return true, and then retain_dentry() have no chance to check DCACHE_DONTCACHE. As a result, the dentry won't be killed and the corresponding inode can't be evicted. In the following example, the DAX policy can't take effects unless we do a drop_caches manually. # DCACHE_LRU_LIST will be set echo abcdefg > test.txt # DCACHE_REFERENCED will be set and DCACHE_DONTCACHE can't do anything xfs_io -c 'chattr +x' test.txt # Drop caches to make DAX changing take effects echo 2 > /proc/sys/vm/drop_caches What this patch does is preventing fast_dput() from returning true if DCACHE_DONTCACHE is set. Then retain_dentry() will detect the DCACHE_DONTCACHE and will return false. As a result, the dentry will be killed and the inode will be evicted. In this way, if we change per-file DAX policy, it will take effects automatically after this file is closed by all processes. I also add some comments to make the code more clear. Signed-off-by: Hao Li Reviewed-by: Jan Kara Reviewed-by: Ira Weiny Signed-off-by: Al Viro --- fs/dcache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index ea0485861d93..97e81a844a96 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -793,10 +793,17 @@ static inline bool fast_dput(struct dentry *dentry) * a reference to the dentry and change that, but * our work is done - we can leave the dentry * around with a zero refcount. + * + * Nevertheless, there are two cases that we should kill + * the dentry anyway. + * 1. free disconnected dentries as soon as their refcount + * reached zero. + * 2. free dentries if they should not be cached. */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | + DCACHE_DISCONNECTED | DCACHE_DONTCACHE; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) From 1a97d899ecbc4b60c8e8f9b41cde443510b5b1bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Sep 2020 17:55:58 +0100 Subject: [PATCH 040/326] Make sure that make_create_in_sticky() never sees uninitialized value of dir_mode make sure nd->dir_mode is always initialized after success exit from link_path_walk(); in case of empty path it did not happen. Reported-by: Anant Thazhemadam Tested-by: Anant Thazhemadam Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index d4a6dd772303..fc193c684a57 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2114,8 +2114,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) return PTR_ERR(name); while (*name=='/') name++; - if (!*name) + if (!*name) { + nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; + } /* At this point we know we have a real path component. */ for(;;) { From 33114c4359592d5c8a3d840eee9ff40039caf26f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 13 Dec 2020 01:54:29 +0900 Subject: [PATCH 041/326] kbuild: do not use scripts/ld-version.sh for checking spatch version scripts/ld-version.sh was, as its file name implies, originally intended for the GNU ld version, but is (ab)used for the spatch version too. Use 'sort -CV' for the version comparison, then coccicheck does not need to use scripts/ld-version.sh. Fix nsdeps as well. Signed-off-by: Masahiro Yamada Signed-off-by: Julia Lawall --- scripts/coccicheck | 14 +++++--------- scripts/nsdeps | 4 +--- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/scripts/coccicheck b/scripts/coccicheck index 209bb0427b43..d7f6b7ff130a 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -16,7 +16,6 @@ if [ ! -x "$SPATCH" ]; then fi SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}') -SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh) USE_JOBS="no" $SPATCH --help | grep "\-\-jobs" > /dev/null && USE_JOBS="yes" @@ -186,14 +185,11 @@ coccinelle () { OPT=`grep "Options:" $COCCI | cut -d':' -f2` REQ=`grep "Requires:" $COCCI | cut -d':' -f2 | sed "s| ||"` - REQ_NUM=$(echo $REQ | ${DIR}/scripts/ld-version.sh) - if [ "$REQ_NUM" != "0" ] ; then - if [ "$SPATCH_VERSION_NUM" -lt "$REQ_NUM" ] ; then - echo "Skipping coccinelle SmPL patch: $COCCI" - echo "You have coccinelle: $SPATCH_VERSION" - echo "This SmPL patch requires: $REQ" - return - fi + if [ -n "$REQ" ] && ! { echo "$REQ"; echo "$SPATCH_VERSION"; } | sort -CV ; then + echo "Skipping coccinelle SmPL patch: $COCCI" + echo "You have coccinelle: $SPATCH_VERSION" + echo "This SmPL patch requires: $REQ" + return fi # The option '--parse-cocci' can be used to syntactically check the SmPL files. diff --git a/scripts/nsdeps b/scripts/nsdeps index dab4c1a0e27d..e8ce2a4d704a 100644 --- a/scripts/nsdeps +++ b/scripts/nsdeps @@ -12,11 +12,9 @@ if [ ! -x "$SPATCH" ]; then exit 1 fi -SPATCH_REQ_VERSION_NUM=$(echo $SPATCH_REQ_VERSION | ${DIR}/scripts/ld-version.sh) SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}') -SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh) -if [ "$SPATCH_VERSION_NUM" -lt "$SPATCH_REQ_VERSION_NUM" ] ; then +if ! { echo "$SPATCH_REQ_VERSION"; echo "$SPATCH_VERSION"; } | sort -CV ; then echo "spatch needs to be version $SPATCH_REQ_VERSION or higher" exit 1 fi From 7f6f1dfb2dcbe5d2bfa213f2df5d74c147cd5954 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 8 Nov 2020 08:25:49 -0800 Subject: [PATCH 042/326] watchdog: armada_37xx: Add missing dependency on HAS_IOMEM The following kbuild warning is seen on a system without HAS_IOMEM. WARNING: unmet direct dependencies detected for MFD_SYSCON Depends on [n]: HAS_IOMEM [=n] Selected by [y]: - ARMADA_37XX_WATCHDOG [=y] && WATCHDOG [=y] && (ARCH_MVEBU || COMPILE_TEST This results in a subsequent compile error. drivers/watchdog/armada_37xx_wdt.o: in function `armada_37xx_wdt_probe': armada_37xx_wdt.c:(.text+0xdc): undefined reference to `devm_ioremap' Add the missing dependency. Reported-by: Necip Fazil Yildiran Fixes: 54e3d9b518c8 ("watchdog: Add support for Armada 37xx CPU watchdog") Link: https://lore.kernel.org/r/20201108162550.27660-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index fd7968635e6d..b3e8bdaa2a11 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -386,6 +386,7 @@ config ARM_SBSA_WATCHDOG config ARMADA_37XX_WATCHDOG tristate "Armada 37xx watchdog" depends on ARCH_MVEBU || COMPILE_TEST + depends on HAS_IOMEM select MFD_SYSCON select WATCHDOG_CORE help From 8ae2511112d2e18bc7d324b77f965d34083a25a2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 8 Nov 2020 08:25:50 -0800 Subject: [PATCH 043/326] watchdog: sirfsoc: Add missing dependency on HAS_IOMEM If HAS_IOMEM is not defined and SIRFSOC_WATCHDOG is enabled, the build fails with the following error. drivers/watchdog/sirfsoc_wdt.o: in function `sirfsoc_wdt_probe': sirfsoc_wdt.c:(.text+0x112): undefined reference to `devm_platform_ioremap_resource' Reported-by: Necip Fazil Yildiran Fixes: da2a68b3eb47 ("watchdog: Enable COMPILE_TEST where possible") Link: https://lore.kernel.org/r/20201108162550.27660-2-linux@roeck-us.net Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index b3e8bdaa2a11..f8e9be65036a 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -790,6 +790,7 @@ config MOXART_WDT config SIRFSOC_WATCHDOG tristate "SiRFSOC watchdog" + depends on HAS_IOMEM depends on ARCH_SIRF || COMPILE_TEST select WATCHDOG_CORE default y From f61a59acb462840bebcc192f754fe71b6a16ff99 Mon Sep 17 00:00:00 2001 From: Lingling Xu Date: Thu, 29 Oct 2020 10:39:31 +0800 Subject: [PATCH 044/326] watchdog: sprd: remove watchdog disable from resume fail path sprd_wdt_start() would return fail if the loading operation is not completed in a certain time, disabling watchdog for that case would probably cause the kernel crash when kick watchdog later, that's too bad, so remove the watchdog disable operation for the fail case to make sure other parts in the kernel can run normally. [ chunyan: Massaged changelog ] Fixes: 477603467009 ("watchdog: Add Spreadtrum watchdog driver") Signed-off-by: Lingling Xu Signed-off-by: Chunyan Zhang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201029023933.24548-2-zhang.lyra@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sprd_wdt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c index 65cb55f3916f..f3c90b4afead 100644 --- a/drivers/watchdog/sprd_wdt.c +++ b/drivers/watchdog/sprd_wdt.c @@ -345,15 +345,10 @@ static int __maybe_unused sprd_wdt_pm_resume(struct device *dev) if (ret) return ret; - if (watchdog_active(&wdt->wdd)) { + if (watchdog_active(&wdt->wdd)) ret = sprd_wdt_start(&wdt->wdd); - if (ret) { - sprd_wdt_disable(wdt); - return ret; - } - } - return 0; + return ret; } static const struct dev_pm_ops sprd_wdt_pm_ops = { From 3e07d240939803bed9feb2a353d94686a411a7ca Mon Sep 17 00:00:00 2001 From: Lingling Xu Date: Mon, 9 Nov 2020 11:00:54 +0800 Subject: [PATCH 045/326] watchdog: sprd: check busy bit before new loading rather than after that As the specification described, users must check busy bit before start a new loading operation to make sure that the previous loading is done and the device is ready to accept a new one. [ chunyan: Massaged changelog ] Fixes: 477603467009 ("watchdog: Add Spreadtrum watchdog driver") Signed-off-by: Lingling Xu Signed-off-by: Chunyan Zhang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201029023933.24548-3-zhang.lyra@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sprd_wdt.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c index f3c90b4afead..b9b1daa9e2a4 100644 --- a/drivers/watchdog/sprd_wdt.c +++ b/drivers/watchdog/sprd_wdt.c @@ -108,18 +108,6 @@ static int sprd_wdt_load_value(struct sprd_wdt *wdt, u32 timeout, u32 tmr_step = timeout * SPRD_WDT_CNT_STEP; u32 prtmr_step = pretimeout * SPRD_WDT_CNT_STEP; - sprd_wdt_unlock(wdt->base); - writel_relaxed((tmr_step >> SPRD_WDT_CNT_HIGH_SHIFT) & - SPRD_WDT_LOW_VALUE_MASK, wdt->base + SPRD_WDT_LOAD_HIGH); - writel_relaxed((tmr_step & SPRD_WDT_LOW_VALUE_MASK), - wdt->base + SPRD_WDT_LOAD_LOW); - writel_relaxed((prtmr_step >> SPRD_WDT_CNT_HIGH_SHIFT) & - SPRD_WDT_LOW_VALUE_MASK, - wdt->base + SPRD_WDT_IRQ_LOAD_HIGH); - writel_relaxed(prtmr_step & SPRD_WDT_LOW_VALUE_MASK, - wdt->base + SPRD_WDT_IRQ_LOAD_LOW); - sprd_wdt_lock(wdt->base); - /* * Waiting the load value operation done, * it needs two or three RTC clock cycles. @@ -134,6 +122,19 @@ static int sprd_wdt_load_value(struct sprd_wdt *wdt, u32 timeout, if (delay_cnt >= SPRD_WDT_LOAD_TIMEOUT) return -EBUSY; + + sprd_wdt_unlock(wdt->base); + writel_relaxed((tmr_step >> SPRD_WDT_CNT_HIGH_SHIFT) & + SPRD_WDT_LOW_VALUE_MASK, wdt->base + SPRD_WDT_LOAD_HIGH); + writel_relaxed((tmr_step & SPRD_WDT_LOW_VALUE_MASK), + wdt->base + SPRD_WDT_LOAD_LOW); + writel_relaxed((prtmr_step >> SPRD_WDT_CNT_HIGH_SHIFT) & + SPRD_WDT_LOW_VALUE_MASK, + wdt->base + SPRD_WDT_IRQ_LOAD_HIGH); + writel_relaxed(prtmr_step & SPRD_WDT_LOW_VALUE_MASK, + wdt->base + SPRD_WDT_IRQ_LOAD_LOW); + sprd_wdt_lock(wdt->base); + return 0; } From 2a6c9c65b2fe1023f8bec543d3c70a107fd8b9fb Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Mon, 9 Nov 2020 11:00:55 +0800 Subject: [PATCH 046/326] watchdog: sprd: change to use usleep_range() instead of busy loop After changing to check busy bit for the previous loading operation instead of the current one, for most of cases, the busy bit is not set for the first time of read, so there's no need to check so frequently, so this patch use usleep_range() to replace cpu_relax() to avoid busy loop. Also this patch change the max times to 11 which would be enough, since according to the specification, the busy bit would be set after a new loading operation and last 2 or 3 RTC clock cycles (about 60us~92us). Fixes: 477603467009 ("watchdog: Add Spreadtrum watchdog driver") Original-by: Lingling Xu Signed-off-by: Chunyan Zhang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201029023933.24548-4-zhang.lyra@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sprd_wdt.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/watchdog/sprd_wdt.c b/drivers/watchdog/sprd_wdt.c index b9b1daa9e2a4..4e689b6ff141 100644 --- a/drivers/watchdog/sprd_wdt.c +++ b/drivers/watchdog/sprd_wdt.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -53,7 +54,7 @@ #define SPRD_WDT_CNT_HIGH_SHIFT 16 #define SPRD_WDT_LOW_VALUE_MASK GENMASK(15, 0) -#define SPRD_WDT_LOAD_TIMEOUT 1000 +#define SPRD_WDT_LOAD_TIMEOUT 11 struct sprd_wdt { void __iomem *base; @@ -109,15 +110,17 @@ static int sprd_wdt_load_value(struct sprd_wdt *wdt, u32 timeout, u32 prtmr_step = pretimeout * SPRD_WDT_CNT_STEP; /* - * Waiting the load value operation done, - * it needs two or three RTC clock cycles. + * Checking busy bit to make sure the previous loading operation is + * done. According to the specification, the busy bit would be set + * after a new loading operation and last 2 or 3 RTC clock + * cycles (about 60us~92us). */ do { val = readl_relaxed(wdt->base + SPRD_WDT_INT_RAW); if (!(val & SPRD_WDT_LD_BUSY_BIT)) break; - cpu_relax(); + usleep_range(10, 100); } while (delay_cnt++ < SPRD_WDT_LOAD_TIMEOUT); if (delay_cnt >= SPRD_WDT_LOAD_TIMEOUT) From 7c7164f935c8190af7e3663f4e82edc0607dc3a4 Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Fri, 6 Nov 2020 15:23:27 +0100 Subject: [PATCH 047/326] watchdog: stm32_iwdg: don't print an error on probe deferral Do not print an error trace when deferring probe for clock resources. Signed-off-by: Etienne Carriere Signed-off-by: Christophe Roullier Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201106142327.3129-2-christophe.roullier@st.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/stm32_iwdg.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/watchdog/stm32_iwdg.c b/drivers/watchdog/stm32_iwdg.c index 25188d6bbe15..a3436c296c97 100644 --- a/drivers/watchdog/stm32_iwdg.c +++ b/drivers/watchdog/stm32_iwdg.c @@ -162,18 +162,15 @@ static int stm32_iwdg_clk_init(struct platform_device *pdev, u32 ret; wdt->clk_lsi = devm_clk_get(dev, "lsi"); - if (IS_ERR(wdt->clk_lsi)) { - dev_err(dev, "Unable to get lsi clock\n"); - return PTR_ERR(wdt->clk_lsi); - } + if (IS_ERR(wdt->clk_lsi)) + return dev_err_probe(dev, PTR_ERR(wdt->clk_lsi), "Unable to get lsi clock\n"); /* optional peripheral clock */ if (wdt->data->has_pclk) { wdt->clk_pclk = devm_clk_get(dev, "pclk"); - if (IS_ERR(wdt->clk_pclk)) { - dev_err(dev, "Unable to get pclk clock\n"); - return PTR_ERR(wdt->clk_pclk); - } + if (IS_ERR(wdt->clk_pclk)) + return dev_err_probe(dev, PTR_ERR(wdt->clk_pclk), + "Unable to get pclk clock\n"); ret = clk_prepare_enable(wdt->clk_pclk); if (ret) { From 4600736f050f210bcdaafd2ef730ad736da9bc0c Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 6 Nov 2020 14:05:07 +0100 Subject: [PATCH 048/326] watchdog: remove pnx83xx driver Commit 625326ea9c84 ("MIPS: Remove PNX833x alias NXP_STB22x") removed support for PNX833x, so it's time to remove watchdog driver, too. Signed-off-by: Thomas Bogendoerfer Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201106130508.103598-1-tsbogend@alpha.franken.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 10 -- drivers/watchdog/Makefile | 1 - drivers/watchdog/pnx833x_wdt.c | 277 --------------------------------- 3 files changed, 288 deletions(-) delete mode 100644 drivers/watchdog/pnx833x_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index f8e9be65036a..5d3c3ec918a8 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1698,16 +1698,6 @@ config WDT_MTX1 Hardware driver for the MTX-1 boards. This is a watchdog timer that will reboot the machine after a 100 seconds timer expired. -config PNX833X_WDT - tristate "PNX833x Hardware Watchdog" - depends on SOC_PNX8335 - depends on BROKEN - help - Hardware driver for the PNX833x's watchdog. This is a - watchdog timer that will reboot the machine after a programmable - timer has expired and no process has written to /dev/watchdog during - that time. - config SIBYTE_WDOG tristate "Sibyte SoC hardware watchdog" depends on CPU_SB1 || (MIPS && COMPILE_TEST) diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 071a2e50be98..5c74ee19d441 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -161,7 +161,6 @@ obj-$(CONFIG_RC32434_WDT) += rc32434_wdt.o obj-$(CONFIG_INDYDOG) += indydog.o obj-$(CONFIG_JZ4740_WDT) += jz4740_wdt.o obj-$(CONFIG_WDT_MTX1) += mtx-1_wdt.o -obj-$(CONFIG_PNX833X_WDT) += pnx833x_wdt.o obj-$(CONFIG_SIBYTE_WDOG) += sb_wdog.o obj-$(CONFIG_AR7_WDT) += ar7_wdt.o obj-$(CONFIG_TXX9_WDT) += txx9wdt.o diff --git a/drivers/watchdog/pnx833x_wdt.c b/drivers/watchdog/pnx833x_wdt.c deleted file mode 100644 index 4097d076aab8..000000000000 --- a/drivers/watchdog/pnx833x_wdt.c +++ /dev/null @@ -1,277 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * PNX833x Hardware Watchdog Driver - * Copyright 2008 NXP Semiconductors - * Daniel Laird - * Andre McCurdy - * - * Heavily based upon - IndyDog 0.3 - * A Hardware Watchdog Device for SGI IP22 - * - * (c) Copyright 2002 Guido Guenther , All Rights Reserved. - * - * based on softdog.c by Alan Cox - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define WATCHDOG_TIMEOUT 30 /* 30 sec Maximum timeout */ -#define WATCHDOG_COUNT_FREQUENCY 68000000U /* Watchdog counts at 68MHZ. */ -#define PNX_WATCHDOG_TIMEOUT (WATCHDOG_TIMEOUT * WATCHDOG_COUNT_FREQUENCY) -#define PNX_TIMEOUT_VALUE 2040000000U - -/** CONFIG block */ -#define PNX833X_CONFIG (0x07000U) -#define PNX833X_CONFIG_CPU_WATCHDOG (0x54) -#define PNX833X_CONFIG_CPU_WATCHDOG_COMPARE (0x58) -#define PNX833X_CONFIG_CPU_COUNTERS_CONTROL (0x1c) - -/** RESET block */ -#define PNX833X_RESET (0x08000U) -#define PNX833X_RESET_CONFIG (0x08) - -static int pnx833x_wdt_alive; - -/* Set default timeout in MHZ.*/ -static int pnx833x_wdt_timeout = PNX_WATCHDOG_TIMEOUT; -module_param(pnx833x_wdt_timeout, int, 0); -MODULE_PARM_DESC(timeout, "Watchdog timeout in Mhz. (68Mhz clock), default=" - __MODULE_STRING(PNX_TIMEOUT_VALUE) "(30 seconds)."); - -static bool nowayout = WATCHDOG_NOWAYOUT; -module_param(nowayout, bool, 0); -MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" - __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); - -#define START_DEFAULT 1 -static int start_enabled = START_DEFAULT; -module_param(start_enabled, int, 0); -MODULE_PARM_DESC(start_enabled, "Watchdog is started on module insertion " - "(default=" __MODULE_STRING(START_DEFAULT) ")"); - -static void pnx833x_wdt_start(void) -{ - /* Enable watchdog causing reset. */ - PNX833X_REG(PNX833X_RESET + PNX833X_RESET_CONFIG) |= 0x1; - /* Set timeout.*/ - PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_WATCHDOG_COMPARE) = pnx833x_wdt_timeout; - /* Enable watchdog. */ - PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_COUNTERS_CONTROL) |= 0x1; - - pr_info("Started watchdog timer\n"); -} - -static void pnx833x_wdt_stop(void) -{ - /* Disable watchdog causing reset. */ - PNX833X_REG(PNX833X_RESET + PNX833X_CONFIG) &= 0xFFFFFFFE; - /* Disable watchdog.*/ - PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_COUNTERS_CONTROL) &= 0xFFFFFFFE; - - pr_info("Stopped watchdog timer\n"); -} - -static void pnx833x_wdt_ping(void) -{ - PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_WATCHDOG_COMPARE) = pnx833x_wdt_timeout; -} - -/* - * Allow only one person to hold it open - */ -static int pnx833x_wdt_open(struct inode *inode, struct file *file) -{ - if (test_and_set_bit(0, &pnx833x_wdt_alive)) - return -EBUSY; - - if (nowayout) - __module_get(THIS_MODULE); - - /* Activate timer */ - if (!start_enabled) - pnx833x_wdt_start(); - - pnx833x_wdt_ping(); - - pr_info("Started watchdog timer\n"); - - return stream_open(inode, file); -} - -static int pnx833x_wdt_release(struct inode *inode, struct file *file) -{ - /* Shut off the timer. - * Lock it in if it's a module and we defined ...NOWAYOUT */ - if (!nowayout) - pnx833x_wdt_stop(); /* Turn the WDT off */ - - clear_bit(0, &pnx833x_wdt_alive); - return 0; -} - -static ssize_t pnx833x_wdt_write(struct file *file, const char *data, size_t len, loff_t *ppos) -{ - /* Refresh the timer. */ - if (len) - pnx833x_wdt_ping(); - - return len; -} - -static long pnx833x_wdt_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int options, new_timeout = 0; - uint32_t timeout, timeout_left = 0; - - static const struct watchdog_info ident = { - .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT, - .firmware_version = 0, - .identity = "Hardware Watchdog for PNX833x", - }; - - switch (cmd) { - default: - return -ENOTTY; - - case WDIOC_GETSUPPORT: - if (copy_to_user((struct watchdog_info *)arg, - &ident, sizeof(ident))) - return -EFAULT; - return 0; - - case WDIOC_GETSTATUS: - case WDIOC_GETBOOTSTATUS: - return put_user(0, (int *)arg); - - case WDIOC_SETOPTIONS: - if (get_user(options, (int *)arg)) - return -EFAULT; - - if (options & WDIOS_DISABLECARD) - pnx833x_wdt_stop(); - - if (options & WDIOS_ENABLECARD) - pnx833x_wdt_start(); - - return 0; - - case WDIOC_KEEPALIVE: - pnx833x_wdt_ping(); - return 0; - - case WDIOC_SETTIMEOUT: - { - if (get_user(new_timeout, (int *)arg)) - return -EFAULT; - - pnx833x_wdt_timeout = new_timeout; - PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_WATCHDOG_COMPARE) = new_timeout; - return put_user(new_timeout, (int *)arg); - } - - case WDIOC_GETTIMEOUT: - timeout = PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_WATCHDOG_COMPARE); - return put_user(timeout, (int *)arg); - - case WDIOC_GETTIMELEFT: - timeout_left = PNX833X_REG(PNX833X_CONFIG + - PNX833X_CONFIG_CPU_WATCHDOG); - return put_user(timeout_left, (int *)arg); - - } -} - -static int pnx833x_wdt_notify_sys(struct notifier_block *this, - unsigned long code, void *unused) -{ - if (code == SYS_DOWN || code == SYS_HALT) - pnx833x_wdt_stop(); /* Turn the WDT off */ - - return NOTIFY_DONE; -} - -static const struct file_operations pnx833x_wdt_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .write = pnx833x_wdt_write, - .unlocked_ioctl = pnx833x_wdt_ioctl, - .compat_ioctl = compat_ptr_ioctl, - .open = pnx833x_wdt_open, - .release = pnx833x_wdt_release, -}; - -static struct miscdevice pnx833x_wdt_miscdev = { - .minor = WATCHDOG_MINOR, - .name = "watchdog", - .fops = &pnx833x_wdt_fops, -}; - -static struct notifier_block pnx833x_wdt_notifier = { - .notifier_call = pnx833x_wdt_notify_sys, -}; - -static int __init watchdog_init(void) -{ - int ret, cause; - - /* Lets check the reason for the reset.*/ - cause = PNX833X_REG(PNX833X_RESET); - /*If bit 31 is set then watchdog was cause of reset.*/ - if (cause & 0x80000000) { - pr_info("The system was previously reset due to the watchdog firing - please investigate...\n"); - } - - ret = register_reboot_notifier(&pnx833x_wdt_notifier); - if (ret) { - pr_err("cannot register reboot notifier (err=%d)\n", ret); - return ret; - } - - ret = misc_register(&pnx833x_wdt_miscdev); - if (ret) { - pr_err("cannot register miscdev on minor=%d (err=%d)\n", - WATCHDOG_MINOR, ret); - unregister_reboot_notifier(&pnx833x_wdt_notifier); - return ret; - } - - pr_info("Hardware Watchdog Timer for PNX833x: Version 0.1\n"); - - if (start_enabled) - pnx833x_wdt_start(); - - return 0; -} - -static void __exit watchdog_exit(void) -{ - misc_deregister(&pnx833x_wdt_miscdev); - unregister_reboot_notifier(&pnx833x_wdt_notifier); -} - -module_init(watchdog_init); -module_exit(watchdog_exit); - -MODULE_AUTHOR("Daniel Laird/Andre McCurdy"); -MODULE_DESCRIPTION("Hardware Watchdog Device for PNX833x"); -MODULE_LICENSE("GPL"); From 8650d0f9e9334f2e1c209f1e2ac8341f91e30d75 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Sat, 31 Oct 2020 13:11:15 +0100 Subject: [PATCH 049/326] watchdog: qcom_wdt: set WDOG_HW_RUNNING bit when appropriate If the watchdog hardware is enabled/running during boot, e.g. due to a boot loader configuring it, we must tell the watchdog framework about this fact so that it can ping the watchdog until userspace opens the device and takes over control. Do so using the WDOG_HW_RUNNING flag that exists for exactly that use-case. Signed-off-by: Robert Marko Cc: Luka Perkov Reviewed-by: Guenter Roeck Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201031121115.542752-1-robert.marko@sartura.hr Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/qcom-wdt.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index ab7465d186fd..07d399c4edc4 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -152,6 +152,13 @@ static int qcom_wdt_restart(struct watchdog_device *wdd, unsigned long action, return 0; } +static int qcom_wdt_is_running(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + return (readl(wdt_addr(wdt, WDT_EN)) & QCOM_WDT_ENABLE); +} + static const struct watchdog_ops qcom_wdt_ops = { .start = qcom_wdt_start, .stop = qcom_wdt_stop, @@ -294,6 +301,17 @@ static int qcom_wdt_probe(struct platform_device *pdev) wdt->wdd.timeout = min(wdt->wdd.max_timeout, 30U); watchdog_init_timeout(&wdt->wdd, 0, dev); + /* + * If WDT is already running, call WDT start which + * will stop the WDT, set timeouts as bootloader + * might use different ones and set running bit + * to inform the WDT subsystem to ping the WDT + */ + if (qcom_wdt_is_running(&wdt->wdd)) { + qcom_wdt_start(&wdt->wdd); + set_bit(WDOG_HW_RUNNING, &wdt->wdd.status); + } + ret = devm_watchdog_register_device(dev, &wdt->wdd); if (ret) return ret; From 8711071e9700b67045fe5518161d63f7a03e3c9e Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Fri, 30 Oct 2020 23:49:09 +0800 Subject: [PATCH 050/326] watchdog: rti-wdt: fix reference leak in rti_wdt_probe pm_runtime_get_sync() will increment pm usage counter even it failed. Forgetting to call pm_runtime_put_noidle will result in reference leak in rti_wdt_probe, so we should fix it. Signed-off-by: Zhang Qilong Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201030154909.100023-1-zhangqilong3@huawei.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/rti_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index 836319cbaca9..359302f71f7e 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -227,8 +227,10 @@ static int rti_wdt_probe(struct platform_device *pdev) pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); - if (ret) + if (ret) { + pm_runtime_put_noidle(dev); return dev_err_probe(dev, ret, "runtime pm failed\n"); + } platform_set_drvdata(pdev, wdt); From 9747f12b5be9f55bfba72a82b619355cd861bdfe Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 19 Oct 2020 10:53:42 -0700 Subject: [PATCH 051/326] watchdog: geodewdt: remove unneeded break A break is not needed if it is preceded by a return Signed-off-by: Tom Rix Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201019175342.2646-1-trix@redhat.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/geodewdt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/watchdog/geodewdt.c b/drivers/watchdog/geodewdt.c index 83418924e30a..0b699c783d57 100644 --- a/drivers/watchdog/geodewdt.c +++ b/drivers/watchdog/geodewdt.c @@ -150,8 +150,6 @@ static long geodewdt_ioctl(struct file *file, unsigned int cmd, case WDIOC_GETSUPPORT: return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0; - break; - case WDIOC_GETSTATUS: case WDIOC_GETBOOTSTATUS: return put_user(0, p); From 347755d2a88e54e7462be23f1e1a1018d9be4a4b Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sun, 25 Oct 2020 13:45:18 +0100 Subject: [PATCH 052/326] watchdog: sbc_fitpc2_wdt: add __user annotations After a change to the put_user() macro on x86, kernel test robot has started sending me complaints (from sparse) about passing kernel pointers to put_user(). So add the __user annotations to the various casts in fitpc2_wdt_ioctl(), and while in here, also make the write method actually match the prototype of file_operations::write. Reported-by: kernel test robot Signed-off-by: Rasmus Villemoes Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201025124518.31647-1-linux@rasmusvillemoes.dk Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sbc_fitpc2_wdt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/sbc_fitpc2_wdt.c b/drivers/watchdog/sbc_fitpc2_wdt.c index 04483d6453d6..13db71e16583 100644 --- a/drivers/watchdog/sbc_fitpc2_wdt.c +++ b/drivers/watchdog/sbc_fitpc2_wdt.c @@ -78,7 +78,7 @@ static int fitpc2_wdt_open(struct inode *inode, struct file *file) return stream_open(inode, file); } -static ssize_t fitpc2_wdt_write(struct file *file, const char *data, +static ssize_t fitpc2_wdt_write(struct file *file, const char __user *data, size_t len, loff_t *ppos) { size_t i; @@ -125,16 +125,16 @@ static long fitpc2_wdt_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case WDIOC_GETSUPPORT: - ret = copy_to_user((struct watchdog_info *)arg, &ident, + ret = copy_to_user((struct watchdog_info __user *)arg, &ident, sizeof(ident)) ? -EFAULT : 0; break; case WDIOC_GETSTATUS: - ret = put_user(0, (int *)arg); + ret = put_user(0, (int __user *)arg); break; case WDIOC_GETBOOTSTATUS: - ret = put_user(0, (int *)arg); + ret = put_user(0, (int __user *)arg); break; case WDIOC_KEEPALIVE: @@ -143,7 +143,7 @@ static long fitpc2_wdt_ioctl(struct file *file, unsigned int cmd, break; case WDIOC_SETTIMEOUT: - ret = get_user(time, (int *)arg); + ret = get_user(time, (int __user *)arg); if (ret) break; @@ -157,7 +157,7 @@ static long fitpc2_wdt_ioctl(struct file *file, unsigned int cmd, fallthrough; case WDIOC_GETTIMEOUT: - ret = put_user(margin, (int *)arg); + ret = put_user(margin, (int __user *)arg); break; } From 42e967f3c6cb3828f07a3822d7249bccb55221a4 Mon Sep 17 00:00:00 2001 From: Zhao Qiang Date: Fri, 27 Nov 2020 15:52:17 +0800 Subject: [PATCH 053/326] wdt: sp805: add watchdog_stop on reboot Call watchdog_stop_on_reboot in probe func Signed-off-by: Zhao Qiang Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201127075217.31312-1-qiang.zhao@nxp.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sp805_wdt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/sp805_wdt.c b/drivers/watchdog/sp805_wdt.c index 190d26e2e75f..958dc32a708f 100644 --- a/drivers/watchdog/sp805_wdt.c +++ b/drivers/watchdog/sp805_wdt.c @@ -291,6 +291,7 @@ sp805_wdt_probe(struct amba_device *adev, const struct amba_id *id) set_bit(WDOG_HW_RUNNING, &wdt->wdd.status); } + watchdog_stop_on_reboot(&wdt->wdd); ret = watchdog_register_device(&wdt->wdd); if (ret) goto err; From acc195bd2cc48445ea35d00036d8c0afcc4fcc9c Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Sun, 22 Nov 2020 19:08:39 -0700 Subject: [PATCH 054/326] watchdog/hpwdt: Disable NMI in Crash Kernel NMIs received during the crash path are problematic as hpwdt_pretimeout handling of the NMI would cause a reentry into kdump. The situation is complicated in that I/O errors can be signaled as NMI circumventing hpwdt_pretimeout's attempt to not claim NMI not associated with either the WDT or the iLO NMI switch. These NMI can additionally cause a secondary NMI which cause the system to hang. By disabling pretimeout and hpwdtimeout in crash path we both reduce the risk of receiving an NMI and simuletaneously leave the WDT running (if it was already in use) to allow the WDT to break the system out of hangs by the WDT reset. Signed-off-by: Jerry Hoemann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1606097320-56762-2-git-send-email-jerry.hoemann@hpe.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/hpwdt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 7d34bcf1c45b..eeb4df24a7e7 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -21,6 +21,7 @@ #include #include #include +#include #define HPWDT_VERSION "2.0.3" #define SECS_TO_TICKS(secs) ((secs) * 1000 / 128) @@ -334,6 +335,11 @@ static int hpwdt_init_one(struct pci_dev *dev, watchdog_set_nowayout(&hpwdt_dev, nowayout); watchdog_init_timeout(&hpwdt_dev, soft_margin, NULL); + if (is_kdump_kernel()) { + pretimeout = 0; + kdumptimeout = 0; + } + if (pretimeout && hpwdt_dev.timeout <= PRETIMEOUT_SEC) { dev_warn(&dev->dev, "timeout <= pretimeout. Setting pretimeout to zero\n"); pretimeout = 0; From 5674b74e52c052a34e0c4e8a14cdb0924f816d5e Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Sun, 22 Nov 2020 19:08:40 -0700 Subject: [PATCH 055/326] watchdog/hpwdt: Reflect changes Bump driver number to reflect recent changes. Signed-off-by: Jerry Hoemann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/1606097320-56762-3-git-send-email-jerry.hoemann@hpe.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/hpwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index eeb4df24a7e7..cbd1498ff015 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -23,7 +23,7 @@ #include #include -#define HPWDT_VERSION "2.0.3" +#define HPWDT_VERSION "2.0.4" #define SECS_TO_TICKS(secs) ((secs) * 1000 / 128) #define TICKS_TO_SECS(ticks) ((ticks) * 128 / 1000) #define HPWDT_MAX_TICKS 65535 From 24f98562bb5b1cd6184c583fb53a6068992bec4b Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Thu, 12 Nov 2020 01:22:05 +0800 Subject: [PATCH 056/326] watchdog: wdat_wdt: Fix missing kerneldoc reported by W=1 Fix the following warning while compiling with W=1. drivers/watchdog/wdat_wdt.c:48: warning: Function parameter or member 'instructions' not described in 'wdat_wdt' Signed-off-by: Wong Vee Khee Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201111172205.17215-1-vee.khee.wong@intel.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/wdat_wdt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c index 3065dd670a18..cec7917790e5 100644 --- a/drivers/watchdog/wdat_wdt.c +++ b/drivers/watchdog/wdat_wdt.c @@ -34,9 +34,9 @@ struct wdat_instruction { * @period: How long is one watchdog period in ms * @stopped_in_sleep: Is this watchdog stopped by the firmware in S1-S5 * @stopped: Was the watchdog stopped by the driver in suspend - * @actions: An array of instruction lists indexed by an action number from - * the WDAT table. There can be %NULL entries for not implemented - * actions. + * @instructions: An array of instruction lists indexed by an action number from + * the WDAT table. There can be %NULL entries for not implemented + * actions. */ struct wdat_wdt { struct platform_device *pdev; From 6f733cb2e7db38f8141b14740bcde577844a03b7 Mon Sep 17 00:00:00 2001 From: Wang Wensheng Date: Mon, 9 Nov 2020 13:05:12 +0000 Subject: [PATCH 057/326] watchdog: Fix potential dereferencing of null pointer A reboot notifier, which stops the WDT by calling the stop hook without any check, would be registered when we set WDOG_STOP_ON_REBOOT flag. Howerer we allow the WDT driver to omit the stop hook since commit "d0684c8a93549" ("watchdog: Make stop function optional") and provide a module parameter for user that controls the WDOG_STOP_ON_REBOOT flag in commit 9232c80659e94 ("watchdog: Add stop_on_reboot parameter to control reboot policy"). Together that commits make user potential to insert a watchdog driver that don't provide a stop hook but with the stop_on_reboot parameter set, then dereferencing of null pointer occurs on system reboot. Check the stop hook before registering the reboot notifier to fix the issue. Fixes: d0684c8a9354 ("watchdog: Make stop function optional") Signed-off-by: Wang Wensheng Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201109130512.28121-1-wangwensheng4@huawei.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/watchdog_core.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 423844757812..0e9a99559609 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -267,15 +267,19 @@ static int __watchdog_register_device(struct watchdog_device *wdd) } if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) { - wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; + if (!wdd->ops->stop) + pr_warn("watchdog%d: stop_on_reboot not supported\n", wdd->id); + else { + wdd->reboot_nb.notifier_call = watchdog_reboot_notifier; - ret = register_reboot_notifier(&wdd->reboot_nb); - if (ret) { - pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", - wdd->id, ret); - watchdog_dev_unregister(wdd); - ida_simple_remove(&watchdog_ida, id); - return ret; + ret = register_reboot_notifier(&wdd->reboot_nb); + if (ret) { + pr_err("watchdog%d: Cannot register reboot notifier (%d)\n", + wdd->id, ret); + watchdog_dev_unregister(wdd); + ida_simple_remove(&watchdog_ida, id); + return ret; + } } } From 89c866f5a238f6f68b0f71fab55f77a07e8f3adb Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Tue, 17 Nov 2020 16:22:12 +0100 Subject: [PATCH 058/326] watchdog: iTCO_wdt: use module_platform_device() macro Reducing init boilerplate by using the module_platform_device macro. Signed-off-by: Enrico Weigelt, metux IT consult Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201117152214.32244-1-info@metux.net Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/iTCO_wdt.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index a370a185a41c..f2ddc8fc71cd 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -651,21 +651,7 @@ static struct platform_driver iTCO_wdt_driver = { }, }; -static int __init iTCO_wdt_init_module(void) -{ - pr_info("Intel TCO WatchDog Timer Driver v%s\n", DRV_VERSION); - - return platform_driver_register(&iTCO_wdt_driver); -} - -static void __exit iTCO_wdt_cleanup_module(void) -{ - platform_driver_unregister(&iTCO_wdt_driver); - pr_info("Watchdog Module Unloaded\n"); -} - -module_init(iTCO_wdt_init_module); -module_exit(iTCO_wdt_cleanup_module); +module_platform_driver(iTCO_wdt_driver); MODULE_AUTHOR("Wim Van Sebroeck "); MODULE_DESCRIPTION("Intel TCO WatchDog Timer Driver"); From e0a6aa30504cb8179d07609fb6386705e8f00663 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 13 Dec 2020 09:39:40 +0100 Subject: [PATCH 059/326] efi: ia64: disable the capsule loader EFI capsule loading is a feature that was introduced into EFI long after its initial introduction on Itanium, and it is highly unlikely that IA64 systems are receiving firmware updates in the first place, let alone using EFI capsules. So let's disable capsule support altogether on IA64. This fixes a build error on IA64 due to a recent change that added an unconditional include of asm/efi.h, which IA64 does not provide. While at it, tweak the make rules a bit so that the EFI capsule component that is always builtin (even if the EFI capsule loader itself is built as a module) is omitted for all architectures if the module is not enabled in the build. Cc: Tony Luck Link: https://lore.kernel.org/linux-efi/20201214152200.38353-1-ardb@kernel.org Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 2 +- drivers/firmware/efi/Makefile | 5 ++++- include/linux/efi.h | 10 ++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index b452cfa2100b..5ac2a37ed025 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -147,7 +147,7 @@ config EFI_BOOTLOADER_CONTROL config EFI_CAPSULE_LOADER tristate "EFI capsule loader" - depends on EFI + depends on EFI && !IA64 help This option exposes a loader interface "/dev/efi_capsule_loader" for users to load EFI capsules. This driver requires working runtime diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index d6ca2da19339..467e94259679 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -12,7 +12,10 @@ KASAN_SANITIZE_runtime-wrappers.o := n obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o tpm.o -obj-$(CONFIG_EFI) += capsule.o memmap.o +obj-$(CONFIG_EFI) += memmap.o +ifneq ($(CONFIG_EFI_CAPSULE_LOADER),) +obj-$(CONFIG_EFI) += capsule.o +endif obj-$(CONFIG_EFI_PARAMS_FROM_FDT) += fdtparams.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o diff --git a/include/linux/efi.h b/include/linux/efi.h index 1cd5d91d8ca1..763b816ba19c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -817,12 +817,6 @@ static inline bool efi_enabled(int feature) static inline void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {} -static inline bool -efi_capsule_pending(int *reset_type) -{ - return false; -} - static inline bool efi_soft_reserve_enabled(void) { return false; @@ -1038,6 +1032,7 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, size_t len); +#if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER) extern bool efi_capsule_pending(int *reset_type); extern int efi_capsule_supported(efi_guid_t guid, u32 flags, @@ -1045,6 +1040,9 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags, extern int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages); +#else +static inline bool efi_capsule_pending(int *reset_type) { return false; } +#endif #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); From d72c8b0e1cacc39495cd413433d260e8ae59374a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 13 Dec 2020 16:07:03 +0100 Subject: [PATCH 060/326] efi: arm: force use of unsigned type for EFI_PHYS_ALIGN Ensure that EFI_PHYS_ALIGN is an unsigned type, to prevent spurious warnings from the type checks in the definition of the max() macro. Link: https://lore.kernel.org/linux-efi/20201213151306.73558-1-ardb@kernel.org Signed-off-by: Ard Biesheuvel --- arch/arm/include/asm/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index abae071a02e1..9de7ab2ce05d 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -71,7 +71,7 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) * here throws off the memory allocation logic, so let's use the lowest power * of two greater than 2 MiB and greater than TEXT_OFFSET. */ -#define EFI_PHYS_ALIGN max(SZ_2M, roundup_pow_of_two(TEXT_OFFSET)) +#define EFI_PHYS_ALIGN max(UL(SZ_2M), roundup_pow_of_two(TEXT_OFFSET)) /* on ARM, the initrd should be loaded in a lowmem region */ static inline unsigned long efi_get_max_initrd_addr(unsigned long image_addr) From ae7927023243dcc7389b2d59b16c09cbbeaecc36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Dec 2020 17:14:08 +0100 Subject: [PATCH 061/326] sched: Optimize finish_lock_switch() The kernel test robot measured a -1.6% performance regression on will-it-scale/sched_yield due to commit: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Even though we were careful to replace a single load with another single load from the same cacheline. Restore finish_lock_switch() to the exact state before the offending patch and solve the problem differently. Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201210161408.GX3021@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 40 +++++++++++++++------------------------- kernel/sched/sched.h | 13 +++++-------- 2 files changed, 20 insertions(+), 33 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7af80c3fce12..0ca7d2dc16d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3985,15 +3985,20 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) } } +static void balance_push(struct rq *rq); + +struct callback_head balance_push_callback = { + .next = NULL, + .func = (void (*)(struct callback_head *))balance_push, +}; + static inline struct callback_head *splice_balance_callbacks(struct rq *rq) { struct callback_head *head = rq->balance_callback; lockdep_assert_held(&rq->lock); - if (head) { + if (head) rq->balance_callback = NULL; - rq->balance_flags &= ~BALANCE_WORK; - } return head; } @@ -4014,21 +4019,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) } } -static void balance_push(struct rq *rq); - -static inline void balance_switch(struct rq *rq) -{ - if (likely(!rq->balance_flags)) - return; - - if (rq->balance_flags & BALANCE_PUSH) { - balance_push(rq); - return; - } - - __balance_callbacks(rq); -} - #else static inline void __balance_callbacks(struct rq *rq) @@ -4044,10 +4034,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) { } -static inline void balance_switch(struct rq *rq) -{ -} - #endif static inline void @@ -4075,7 +4061,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - balance_switch(rq); + __balance_callbacks(rq); raw_spin_unlock_irq(&rq->lock); } @@ -7256,6 +7242,10 @@ static void balance_push(struct rq *rq) lockdep_assert_held(&rq->lock); SCHED_WARN_ON(rq->cpu != smp_processor_id()); + /* + * Ensure the thing is persistent until balance_push_set(.on = false); + */ + rq->balance_callback = &balance_push_callback; /* * Both the cpu-hotplug and stop task are in this case and are @@ -7305,9 +7295,9 @@ static void balance_push_set(int cpu, bool on) rq_lock_irqsave(rq, &rf); if (on) - rq->balance_flags |= BALANCE_PUSH; + rq->balance_callback = &balance_push_callback; else - rq->balance_flags &= ~BALANCE_PUSH; + rq->balance_callback = NULL; rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f5acb6c5ce49..12ada79d40f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,7 +975,6 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; - unsigned char balance_flags; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1226,6 +1225,8 @@ struct rq_flags { #endif }; +extern struct callback_head balance_push_callback; + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1243,9 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#endif #ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback); + SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); +#endif #endif } @@ -1408,9 +1409,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP -#define BALANCE_WORK 0x01 -#define BALANCE_PUSH 0x02 - static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1418,13 +1416,12 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_held(&rq->lock); - if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) + if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; head->func = (void (*)(struct callback_head *))func; head->next = rq->balance_callback; rq->balance_callback = head; - rq->balance_flags |= BALANCE_WORK; } #define rcu_dereference_check_sched_domain(p) \ From a313357e704f2617f298333e3e617a38b1719760 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:37 +0100 Subject: [PATCH 062/326] genirq: Move irq_has_action() into core code This function uses irq_to_desc() and is going to be used by modules to replace the open coded irq_to_desc() (ab)usage. The final goal is to remove the export of irq_to_desc() so driver cannot fiddle with it anymore. Move it into the core code and fixup the usage sites to include the proper header. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.548936472@linutronix.de --- arch/alpha/kernel/sys_jensen.c | 2 +- arch/x86/kernel/topology.c | 1 + include/linux/interrupt.h | 1 + include/linux/irqdesc.h | 7 +------ kernel/irq/manage.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 0a2ab6cb18db..e5d870ff225f 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,7 +7,7 @@ * * Code supporting the Jensen. */ - +#include #include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to */ +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 870b3251e174..bb8ff9083e7d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -232,6 +232,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); # define local_irq_enable_in_hardirq() local_irq_enable() #endif +bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 5745491303e0..385a4fafe631 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { - return desc->action != NULL; -} - -static inline int irq_has_action(unsigned int irq) -{ - return irq_desc_has_action(irq_to_desc(irq)); + return desc && desc->action != NULL; } /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c826ba4141fe..a5a1cde5c1a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2822,3 +2822,20 @@ out_unlock: return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_has_action - Check whether an interrupt is requested + * @irq: The linux irq number + * + * Returns: A snapshot of the current state + */ +bool irq_has_action(unsigned int irq) +{ + bool res; + + rcu_read_lock(); + res = irq_desc_has_action(irq_to_desc(irq)); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL_GPL(irq_has_action); From fdd029630434b434b127efc7fba337da28f45658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:38 +0100 Subject: [PATCH 063/326] genirq: Move status flag checks to core These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.703779349@linutronix.de --- include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 385a4fafe631..308d7db8991f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, data->chip = chip; } +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); + static inline bool irq_balancing_disabled(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; + return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU; + return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU_DEVID; + return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } static inline void diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a5a1cde5c1a2..ab8567f32501 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2839,3 +2839,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action); + +/** + * irq_check_status_bit - Check whether bits in the irq descriptor status are set + * @irq: The linux irq number + * @bitmask: The bitmask to evaluate + * + * Returns: True if one of the bits in @bitmask is set + */ +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{ + struct irq_desc *desc; + bool res = false; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (desc) + res = !!(desc->status_use_accessors & bitmask); + rcu_read_unlock(); + return res; +} From f1c6306c0d6b50844ba02c8a53e35405e9c0db05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:39 +0100 Subject: [PATCH 064/326] genirq: Move irq_set_lockdep_class() to core irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.860029489@linutronix.de --- include/linux/irqdesc.h | 10 ++++------ kernel/irq/irqdesc.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 308d7db8991f..4a1d016716f4 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(unsigned int irq) return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - lockdep_set_class(&desc->lock, lock_class); - lockdep_set_class(&desc->request_mutex, request_class); - } + if (IS_ENABLED(CONFIG_LOCKDEP)) + __irq_set_lockdep_class(irq, lock_class, request_class); } #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e810eb9906ea..20a54fa7cd30 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int irq) rcu_read_unlock(); return sum; } + +#ifdef CONFIG_LOCKDEP +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } +} +EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); +#endif From 3e2380123fb96987ce958f623207010c667ffa7c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:40 +0100 Subject: [PATCH 065/326] genirq: Provide irq_get_effective_affinity() Provide an accessor to the effective interrupt affinity mask. Going to be used to replace open coded fiddling with the irq descriptor. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.967177918@linutronix.de --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index c332871d59da..4aeb1c4c7e07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -906,6 +906,13 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) } #endif +static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + + return d ? irq_data_get_effective_affinity_mask(d) : NULL; +} + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, From 9e42ad10cedf0632fc39860381375806092212bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:41 +0100 Subject: [PATCH 066/326] genirq: Annotate irq stats data races Both the per cpu stats and the accumulated count are accessed lockless and can be concurrently modified. That's intentional and the stats are a rough estimate anyway. Annotate them with data_race(). Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.067097663@linutronix.de --- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/proc.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 20a54fa7cd30..02b446a21ce6 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -943,10 +943,10 @@ unsigned int kstat_irqs(unsigned int irq) if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) - return desc->tot_count; + return data_race(desc->tot_count); for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); + sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu)); return sum; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc..98138788cb04 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc || irq_settings_is_hidden(desc)) goto outsparse; - if (desc->kstat_irqs) + if (desc->kstat_irqs) { for_each_online_cpu(j) - any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j)); + } if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) goto outsparse; From bb0e5192f59875031a0ad060bef2ea0f6c657474 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:42 +0100 Subject: [PATCH 067/326] parisc/irq: Simplify irq count output for /proc/interrupts The SMP variant works perfectly fine on UP as well. Signed-off-by: Thomas Gleixner Cc: linux-parisc@vger.kernel.org Link: https://lore.kernel.org/r/20201210194043.172893840@linutronix.de --- arch/parisc/kernel/irq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index e76c86619949..15c6207cf3d4 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -216,12 +216,9 @@ int show_interrupts(struct seq_file *p, void *v) if (!action) goto skip; seq_printf(p, "%3d: ", i); -#ifdef CONFIG_SMP + for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#else - seq_printf(p, "%10u ", kstat_irqs(i)); -#endif seq_printf(p, " %14s", irq_desc_get_chip(desc)->name); #ifndef PARISC_IRQ_CR16_COUNTS From 26c19d0a8610fb233b31730fe26a31145f2d9796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:43 +0100 Subject: [PATCH 068/326] genirq: Make kstat_irqs() static No more users outside the core code. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.268774449@linutronix.de --- include/linux/kernel_stat.h | 1 - kernel/irq/irqdesc.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 89f0745c096d..44ae1a7eb9e3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -67,7 +67,6 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ -extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq); /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 02b446a21ce6..2eb076f4a566 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc *desc) return desc->istate & IRQS_NMI; } -/** - * kstat_irqs - Get the statistics for an interrupt - * @irq: The interrupt number - * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. The caller must ensure that the interrupt is not removed - * concurrently. - */ -unsigned int kstat_irqs(unsigned int irq) +static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; @@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq) } /** - * kstat_irqs_usr - Get the statistics for an interrupt + * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. - * Contrary to kstat_irqs() this can be called from any context. - * It uses rcu since a concurrent removal of an interrupt descriptor is - * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). + * + * It uses rcu to protect the access since a concurrent removal of an + * interrupt descriptor is observing an rcu grace period before + * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { From 501e2db67fa4264b517de5c7934e94cca89b3a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:44 +0100 Subject: [PATCH 069/326] genirq: Provide kstat_irqdesc_cpu() Most users of kstat_irqs_cpu() have the irq descriptor already. No point in calling into the core code and looking it up once more. Use it in per_cpu_count_show() to start with. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.362094758@linutronix.de --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/irqdesc.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 4a1d016716f4..891b323266df 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, + unsigned int cpu) +{ + return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2eb076f4a566..f509c4db2029 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - int cpu, irq = desc->irq_data.irq; ssize_t ret = 0; char *p = ""; + int cpu; for_each_possible_cpu(cpu) { - unsigned int c = kstat_irqs_cpu(irq, cpu); + unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ","; From 88c637748e3176dcfaa36185e5eaafe6098d43e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:45 +0100 Subject: [PATCH 070/326] ARM: smp: Use irq_desc_kstat_cpu() in show_ipi_list() The irq descriptor is already there, no need to look it up again. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201210194043.454288890@linutronix.de --- arch/arm/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 48099c6e1e4a..e66c46aba5b4 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -550,7 +550,7 @@ void show_ipi_list(struct seq_file *p, int prec) seq_printf(p, "%*s%u: ", prec - 1, "IPI", i); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_types[i]); } From 5089bc51f81f05ad7f0e46db2107be2311343852 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:46 +0100 Subject: [PATCH 071/326] arm64/smp: Use irq_desc_kstat_cpu() in arch_show_interrupts() The irq descriptor is already there, no need to look it up again. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201210194043.546326568@linutronix.de --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 18e9727d3f64..b2e5dc11abed 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -810,7 +810,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu)); seq_printf(p, " %s\n", ipi_types[i]); } From 7435248e6d66e4e853da093c939c28a9f4b92765 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:47 +0100 Subject: [PATCH 072/326] parisc/irq: Use irq_desc_kstat_cpu() in show_interrupts() The irq descriptor is already there, no need to look it up again. Signed-off-by: Thomas Gleixner Cc: linux-parisc@vger.kernel.org Link: https://lore.kernel.org/r/20201210194043.659522455@linutronix.de --- arch/parisc/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 15c6207cf3d4..49cd6d2caefb 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -218,7 +218,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%3d: ", i); for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, j)); seq_printf(p, " %14s", irq_desc_get_chip(desc)->name); #ifndef PARISC_IRQ_CR16_COUNTS From ba22d0ede31779485f0d86d7dcf51387ba810a17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:48 +0100 Subject: [PATCH 073/326] s390/irq: Use irq_desc_kstat_cpu() in show_msi_interrupt() The irq descriptor is already there, no need to look it up again. Signed-off-by: Thomas Gleixner Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20201210194043.769108348@linutronix.de --- arch/s390/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 3514420f0259..f8a8b9428ae2 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -124,7 +124,7 @@ static void show_msi_interrupt(struct seq_file *p, int irq) raw_spin_lock_irqsave(&desc->lock, flags); seq_printf(p, "%3d: ", irq); for_each_online_cpu(cpu) - seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu)); if (desc->irq_data.chip) seq_printf(p, " %8s", desc->irq_data.chip->name); From 3afba095158269c281c49518f49da5a702878919 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:49 +0100 Subject: [PATCH 074/326] drm/i915/lpe_audio: Remove pointless irq_to_desc() usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nothing uses the result and nothing should ever use it in driver code. Signed-off-by: Thomas Gleixner Reviewed-by: Ville Syrjälä Acked-by: Jani Nikula Link: https://lore.kernel.org/r/20201210194043.862572239@linutronix.de --- drivers/gpu/drm/i915/display/intel_lpe_audio.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_lpe_audio.c b/drivers/gpu/drm/i915/display/intel_lpe_audio.c index ad5cc13037ae..1c939f9c9bc9 100644 --- a/drivers/gpu/drm/i915/display/intel_lpe_audio.c +++ b/drivers/gpu/drm/i915/display/intel_lpe_audio.c @@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915_private *dev_priv) */ void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv) { - struct irq_desc *desc; - if (!HAS_LPE_AUDIO(dev_priv)) return; - desc = irq_to_desc(dev_priv->lpe_audio.irq); - lpe_audio_platdev_destroy(dev_priv); irq_free_desc(dev_priv->lpe_audio.irq); From 9c6508b9d2091d14a8fde5d478e19e053bf46552 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:50 +0100 Subject: [PATCH 075/326] drm/i915/pmu: Replace open coded kstat_irqs() copy Driver code has no business with the internals of the irq descriptor. Aside of that the count is per interrupt line and therefore takes interrupts from other devices into account which share the interrupt line and are not handled by the graphics driver. Replace it with a pmu private count which only counts interrupts which originate from the graphics card. To avoid atomics or heuristics of some sort make the counter field 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and postprocessing can easily deal with the occasional wraparound. Signed-off-by: Thomas Gleixner Acked-by: Jani Nikula Cc: Tvrtko Ursulin Link: https://lore.kernel.org/r/20201210194043.957046529@linutronix.de --- drivers/gpu/drm/i915/i915_irq.c | 34 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_pmu.c | 19 +----------------- drivers/gpu/drm/i915/i915_pmu.h | 8 ++++++++ 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 759f523c6a6b..e741cd7f7fc6 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -60,6 +60,24 @@ * and related files, but that will be described in separate chapters. */ +/* + * Interrupt statistic for PMU. Increments the counter only if the + * interrupt originated from the the GPU so interrupts from a device which + * shares the interrupt line are not accounted. + */ +static inline void pmu_irq_stats(struct drm_i915_private *i915, + irqreturn_t res) +{ + if (unlikely(res != IRQ_HANDLED)) + return; + + /* + * A clever compiler translates that into INC. A not so clever one + * should at least prevent store tearing. + */ + WRITE_ONCE(i915->pmu.irq_count, i915->pmu.irq_count + 1); +} + typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val); static const u32 hpd_ilk[HPD_NUM_PINS] = { @@ -1599,6 +1617,8 @@ static irqreturn_t valleyview_irq_handler(int irq, void *arg) valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0); + pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); return ret; @@ -1676,6 +1696,8 @@ static irqreturn_t cherryview_irq_handler(int irq, void *arg) valleyview_pipestat_irq_handler(dev_priv, pipe_stats); } while (0); + pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); return ret; @@ -2103,6 +2125,8 @@ static irqreturn_t ilk_irq_handler(int irq, void *arg) if (sde_ier) raw_reg_write(regs, SDEIER, sde_ier); + pmu_irq_stats(i915, ret); + /* IRQs are synced during runtime_suspend, we don't require a wakeref */ enable_rpm_wakeref_asserts(&i915->runtime_pm); @@ -2419,6 +2443,8 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg) gen8_master_intr_enable(regs); + pmu_irq_stats(dev_priv, IRQ_HANDLED); + return IRQ_HANDLED; } @@ -2514,6 +2540,8 @@ __gen11_irq_handler(struct drm_i915_private * const i915, gen11_gu_misc_irq_handler(gt, gu_misc_iir); + pmu_irq_stats(i915, IRQ_HANDLED); + return IRQ_HANDLED; } @@ -3688,6 +3716,8 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg) i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0); + pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); return ret; @@ -3796,6 +3826,8 @@ static irqreturn_t i915_irq_handler(int irq, void *arg) i915_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0); + pmu_irq_stats(dev_priv, ret); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); return ret; @@ -3941,6 +3973,8 @@ static irqreturn_t i965_irq_handler(int irq, void *arg) i965_pipestat_irq_handler(dev_priv, iir, pipe_stats); } while (0); + pmu_irq_stats(dev_priv, IRQ_HANDLED); + enable_rpm_wakeref_asserts(&dev_priv->runtime_pm); return ret; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 69c0fa20eba1..3b88cb01b4da 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -4,7 +4,6 @@ * Copyright © 2017-2018 Intel Corporation */ -#include #include #include "gt/intel_engine.h" @@ -423,22 +422,6 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) return HRTIMER_RESTART; } -static u64 count_interrupts(struct drm_i915_private *i915) -{ - /* open-coded kstat_irqs() */ - struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq); - u64 sum = 0; - int cpu; - - if (!desc || !desc->kstat_irqs) - return 0; - - for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); - - return sum; -} - static void i915_pmu_event_destroy(struct perf_event *event) { struct drm_i915_private *i915 = @@ -581,7 +564,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event) USEC_PER_SEC /* to MHz */); break; case I915_PMU_INTERRUPTS: - val = count_interrupts(i915); + val = READ_ONCE(pmu->irq_count); break; case I915_PMU_RC6_RESIDENCY: val = get_rc6(&i915->gt); diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 941f0c14037c..9e49c6490780 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -107,6 +107,14 @@ struct i915_pmu { * @sleep_last: Last time GT parked for RC6 estimation. */ ktime_t sleep_last; + /** + * @irq_count: Number of interrupts + * + * Intentionally unsigned long to avoid atomics or heuristics on 32bit. + * 4e9 interrupts are a lot and postprocessing can really deal with an + * occasional wraparound easily. It's 32bit after all. + */ + unsigned long irq_count; /** * @events_attr_group: Device events attribute group. */ From f3925032d7fd4aa627ff10e780430269b3829f83 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:51 +0100 Subject: [PATCH 076/326] pinctrl: nomadik: Use irq_has_action() Let the core code do the fiddling with irq_desc. Signed-off-by: Thomas Gleixner Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20201210194044.065003856@linutronix.de --- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 657e35a75d84..d4ea10803fd9 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -948,8 +948,8 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s, (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); - struct irq_desc *desc = irq_to_desc(irq); const int pullidx = pull ? 1 : 0; + bool wake; int val; static const char * const pulls[] = { "none ", @@ -969,8 +969,9 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s, * This races with request_irq(), set_irq_type(), * and set_irq_wake() ... but those are "rare". */ - if (irq > 0 && desc && desc->action) { + if (irq > 0 && irq_has_action(irq)) { char *trigger; + bool wake; if (nmk_chip->edge_rising & BIT(offset)) trigger = "edge-rising"; @@ -979,10 +980,10 @@ static void nmk_gpio_dbg_show_one(struct seq_file *s, else trigger = "edge-undefined"; + wake = !!(nmk_chip->real_wake & BIT(offset)); + seq_printf(s, " irq-%d %s%s", - irq, trigger, - irqd_is_wakeup_set(&desc->irq_data) - ? " wakeup" : ""); + irq, trigger, wake ? " wakeup" : ""); } } clk_disable(nmk_chip->clk); From 886c8121659dddb6dbfab4cdeb58d75e2d928731 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:52 +0100 Subject: [PATCH 077/326] mfd: ab8500-debugfs: Remove the racy fiddling with irq_desc First of all drivers have absolutely no business to dig into the internals of an irq descriptor. That's core code and subject to change. All of this information is readily available to /proc/interrupts in a safe and race free way. Remove the inspection code which is a blatant violation of subsystem boundaries and racy against concurrent modifications of the interrupt descriptor. Print the irq line instead so the information can be looked up in a sane way in /proc/interrupts. Signed-off-by: Thomas Gleixner Reviewed-by: Linus Walleij Acked-by: Lee Jones Link: https://lore.kernel.org/r/20201210194044.157283633@linutronix.de --- drivers/mfd/ab8500-debugfs.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 6d1bf7c3ca3b..a32039366a93 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -1513,24 +1513,14 @@ static int ab8500_interrupts_show(struct seq_file *s, void *p) { int line; - seq_puts(s, "name: number: number of: wake:\n"); + seq_puts(s, "name: number: irq: number of: wake:\n"); for (line = 0; line < num_interrupt_lines; line++) { - struct irq_desc *desc = irq_to_desc(line + irq_first); - - seq_printf(s, "%3i: %6i %4i", + seq_printf(s, "%3i: %4i %6i %4i\n", line, + line + irq_first, num_interrupts[line], num_wake_interrupts[line]); - - if (desc && desc->name) - seq_printf(s, "-%-8s", desc->name); - if (desc && desc->action) { - struct irqaction *action = desc->action; - - seq_printf(s, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(s, ", %s", action->name); } seq_putc(s, '\n'); } From 1110918e439fde69fdf2fe869f6499d56157fec9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:53 +0100 Subject: [PATCH 078/326] NTB/msi: Use irq_has_action() Use the proper core function. Signed-off-by: Thomas Gleixner Reviewed-by: Logan Gunthorpe Link: https://lore.kernel.org/r/20201210194044.255887860@linutronix.de --- drivers/ntb/msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c index 0a5e884a920c..3f05cfbc73af 100644 --- a/drivers/ntb/msi.c +++ b/drivers/ntb/msi.c @@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, struct ntb_msi_desc *msi_desc) { struct msi_desc *entry; - struct irq_desc *desc; int ret; if (!ntb->msi) return -EINVAL; for_each_pci_msi_entry(entry, ntb->pdev) { - desc = irq_to_desc(entry->irq); - if (desc->action) + if (irq_has_action(entry->irq)) continue; ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler, From e56427068a8d796bb7b8e297f2b6e947380e383f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:54 +0100 Subject: [PATCH 079/326] PCI: xilinx-nwl: Use irq_data_get_irq_chip_data() Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal. In fact it _is_ wrong because the chip callback needs to get the chip data which is relevant for the chip while using the irq descriptor variant returns the irq chip data of the top level chip of a hierarchy. It does not matter in this case because the chip is the top level chip, but that doesn't make it more correct. Signed-off-by: Thomas Gleixner Reviewed-by: Rob Herring Cc: Bjorn Helgaas Link: https://lore.kernel.org/r/20201210194044.364211860@linutronix.de --- drivers/pci/controller/pcie-xilinx-nwl.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index f3cf7d61924f..22135df79d83 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -379,13 +379,11 @@ static void nwl_pcie_msi_handler_low(struct irq_desc *desc) static void nwl_mask_leg_irq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(data->irq); - struct nwl_pcie *pcie; + struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data); unsigned long flags; u32 mask; u32 val; - pcie = irq_desc_get_chip_data(desc); mask = 1 << (data->hwirq - 1); raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags); val = nwl_bridge_readl(pcie, MSGF_LEG_MASK); @@ -395,13 +393,11 @@ static void nwl_mask_leg_irq(struct irq_data *data) static void nwl_unmask_leg_irq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(data->irq); - struct nwl_pcie *pcie; + struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data); unsigned long flags; u32 mask; u32 val; - pcie = irq_desc_get_chip_data(desc); mask = 1 << (data->hwirq - 1); raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags); val = nwl_bridge_readl(pcie, MSGF_LEG_MASK); From b8fecfdfb08dcbabf3d46cfaf7c2fed0e6802ce8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:55 +0100 Subject: [PATCH 080/326] PCI: mobiveil: Use irq_data_get_irq_chip_data() Going through a full irq descriptor lookup instead of just using the proper helper function which provides direct access is suboptimal. In fact it _is_ wrong because the chip callback needs to get the chip data which is relevant for the chip while using the irq descriptor variant returns the irq chip data of the top level chip of a hierarchy. It does not matter in this case because the chip is the top level chip, but that doesn't make it more correct. Signed-off-by: Thomas Gleixner Reviewed-by: Rob Herring Cc: Bjorn Helgaas Link: https://lore.kernel.org/r/20201210194044.473308721@linutronix.de --- drivers/pci/controller/mobiveil/pcie-mobiveil-host.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c index a2632d02ce8f..c637de3a389b 100644 --- a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c +++ b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c @@ -306,13 +306,11 @@ int mobiveil_host_init(struct mobiveil_pcie *pcie, bool reinit) static void mobiveil_mask_intx_irq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(data->irq); - struct mobiveil_pcie *pcie; + struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data); struct mobiveil_root_port *rp; unsigned long flags; u32 mask, shifted_val; - pcie = irq_desc_get_chip_data(desc); rp = &pcie->rp; mask = 1 << ((data->hwirq + PAB_INTX_START) - 1); raw_spin_lock_irqsave(&rp->intx_mask_lock, flags); @@ -324,13 +322,11 @@ static void mobiveil_mask_intx_irq(struct irq_data *data) static void mobiveil_unmask_intx_irq(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(data->irq); - struct mobiveil_pcie *pcie; + struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data); struct mobiveil_root_port *rp; unsigned long flags; u32 shifted_val, mask; - pcie = irq_desc_get_chip_data(desc); rp = &pcie->rp; mask = 1 << ((data->hwirq + PAB_INTX_START) - 1); raw_spin_lock_irqsave(&rp->intx_mask_lock, flags); From 80a62deedf9d449cb65655df39d34b7ef9321d79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:56 +0100 Subject: [PATCH 081/326] net/mlx4: Replace irq_to_desc() abuse No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits. Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack. Signed-off-by: Thomas Gleixner Cc: Tariq Toukan Link: https://lore.kernel.org/r/20201210194044.580936243@linutronix.de --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 8 +++----- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +----- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 3 ++- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 74d466796b7c..2a250f32d5cb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -90,7 +90,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, int cq_idx) { struct mlx4_en_dev *mdev = priv->mdev; - int err = 0; + int irq, err = 0; int timestamp_en = 0; bool assigned_eq = false; @@ -116,10 +116,8 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, assigned_eq = true; } - - cq->irq_desc = - irq_to_desc(mlx4_eq_get_irq(mdev->dev, - cq->vector)); + irq = mlx4_eq_get_irq(mdev->dev, cq->vector); + cq->aff_mask = irq_get_affinity_mask(irq); } else { /* For TX we use the same irq per ring we assigned for the RX */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 502d1b97855c..399459adee7d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -959,8 +959,6 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) /* If we used up all the quota - we're probably not done yet... */ if (done == budget || !clean_complete) { - const struct cpumask *aff; - struct irq_data *idata; int cpu_curr; /* in case we got here because of !clean_complete */ @@ -969,10 +967,8 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) INC_PERF_COUNTER(priv->pstats.napi_quota); cpu_curr = smp_processor_id(); - idata = irq_desc_get_irq_data(cq->irq_desc); - aff = irq_data_get_affinity_mask(idata); - if (likely(cpumask_test_cpu(cpu_curr, aff))) + if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask))) return budget; /* Current cpu is not according to smp_irq_affinity - diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index a46efe37cfa9..48d71e0c71e4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -46,6 +46,7 @@ #endif #include #include +#include #include #include @@ -380,7 +381,7 @@ struct mlx4_en_cq { struct mlx4_cqe *buf; #define MLX4_EN_OPCODE_ERROR 0x1e - struct irq_desc *irq_desc; + const struct cpumask *aff_mask; }; struct mlx4_en_port_profile { From 197d237077295793a3e4ea0abcbea106f8b4217c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:57 +0100 Subject: [PATCH 082/326] net/mlx4: Use effective interrupt affinity Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks. The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs. Signed-off-by: Thomas Gleixner Cc: Tariq Toukan Link: https://lore.kernel.org/r/20201210194044.672935978@linutronix.de --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 2a250f32d5cb..d5fc72b1a36f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -117,7 +117,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, assigned_eq = true; } irq = mlx4_eq_get_irq(mdev->dev, cq->vector); - cq->aff_mask = irq_get_affinity_mask(irq); + cq->aff_mask = irq_get_effective_affinity_mask(irq); } else { /* For TX we use the same irq per ring we assigned for the RX */ From 6e745db4ddd072c7f67b37d850bc5aaedcf35400 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:58 +0100 Subject: [PATCH 083/326] net/mlx5: Replace irq_to_desc() abuse No driver has any business with the internals of an interrupt descriptor. Storing a pointer to it just to use yet another helper at the actual usage site to retrieve the affinity mask is creative at best. Just because C does not allow encapsulation does not mean that the kernel has no limits. Retrieve a pointer to the affinity mask itself and use that. It's still using an interface which is usually not for random drivers, but definitely less hideous than the previous hack. Signed-off-by: Thomas Gleixner Cc: Saeed Mahameed Link: https://lore.kernel.org/r/20201210194044.769458162@linutronix.de --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2f05b0f9de01..45fd585d101b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -669,7 +669,7 @@ struct mlx5e_channel { spinlock_t async_icosq_lock; /* data path - accessed per napi poll */ - struct irq_desc *irq_desc; + const struct cpumask *aff_mask; struct mlx5e_ch_stats *stats; /* control */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ebce97921e03..4345bc41b2be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->num_tc = params->num_tc; c->xdp = !!params->xdp_prog; c->stats = &priv->channel_stats[ix].ch; - c->irq_desc = irq_to_desc(irq); + c->aff_mask = irq_get_affinity_mask(irq); c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix); netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index d5868670f8a5..793e313dcb8b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -40,12 +40,8 @@ static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c) { int current_cpu = smp_processor_id(); - const struct cpumask *aff; - struct irq_data *idata; - idata = irq_desc_get_irq_data(c->irq_desc); - aff = irq_data_get_affinity_mask(idata); - return cpumask_test_cpu(current_cpu, aff); + return cpumask_test_cpu(current_cpu, c->aff_mask); } static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq) From ec7b37b6f08fac3eb9a733efa3d8eae5c3fb0383 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:59 +0100 Subject: [PATCH 084/326] net/mlx5: Use effective interrupt affinity Using the interrupt affinity mask for checking locality is not really working well on architectures which support effective affinity masks. The affinity mask is either the system wide default or set by user space, but the architecture can or even must reduce the mask to the effective set, which means that checking the affinity mask itself does not really tell about the actual target CPUs. Signed-off-by: Thomas Gleixner Cc: Saeed Mahameed Link: https://lore.kernel.org/r/20201210194044.876342330@linutronix.de --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 4345bc41b2be..9bcf73f417d0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->num_tc = params->num_tc; c->xdp = !!params->xdp_prog; c->stats = &priv->channel_stats[ix].ch; - c->aff_mask = irq_get_affinity_mask(irq); + c->aff_mask = irq_get_effective_affinity_mask(irq); c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix); netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); From 3bd5371a4da68613fb3d4aaf961ed8244bcbd741 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:00 +0100 Subject: [PATCH 085/326] xen/events: Remove unused bind_evtchn_to_irq_lateeoi() Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194044.972064156@linutronix.de --- drivers/xen/events/events_base.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6038c4c35db5..d6458e439c78 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1132,12 +1132,6 @@ int bind_evtchn_to_irq(evtchn_port_t evtchn) } EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); -int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) -{ - return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip); -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); - static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; From 67473b8194bc3ecc42d60a4f5dc1ed479f28ed6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:01 +0100 Subject: [PATCH 086/326] xen/events: Remove disfunct affinity spreading This function can only ever work when the event channels: - are already established - interrupts assigned to them - the affinity has been set by user space already because any newly set up event channel is forced to be bound to CPU0 and the affinity mask of the interrupt is forced to contain cpumask_of(0). As the CPU0 enforcement was in place _before_ this was implemented it's entirely unclear how that can ever have worked at all. Remove it as preparation for doing it proper. Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194045.065115500@linutronix.de --- drivers/xen/events/events_base.c | 9 --------- drivers/xen/evtchn.c | 34 +------------------------------- 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index d6458e439c78..9cade1994785 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1696,15 +1696,6 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, return ret; } -/* To be called with desc->lock held. */ -int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu) -{ - struct irq_data *d = irq_desc_get_irq_data(desc); - - return set_affinity_irq(d, cpumask_of(tcpu), false); -} -EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn); - static void enable_dynirq(struct irq_data *data) { evtchn_port_t evtchn = evtchn_from_irq(data->irq); diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 5dc016d68f83..a7a85719a8c8 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -421,36 +421,6 @@ static void evtchn_unbind_from_user(struct per_user_data *u, del_evtchn(u, evtchn); } -static DEFINE_PER_CPU(int, bind_last_selected_cpu); - -static void evtchn_bind_interdom_next_vcpu(evtchn_port_t evtchn) -{ - unsigned int selected_cpu, irq; - struct irq_desc *desc; - unsigned long flags; - - irq = irq_from_evtchn(evtchn); - desc = irq_to_desc(irq); - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - selected_cpu = this_cpu_read(bind_last_selected_cpu); - selected_cpu = cpumask_next_and(selected_cpu, - desc->irq_common_data.affinity, cpu_online_mask); - - if (unlikely(selected_cpu >= nr_cpu_ids)) - selected_cpu = cpumask_first_and(desc->irq_common_data.affinity, - cpu_online_mask); - - this_cpu_write(bind_last_selected_cpu, selected_cpu); - - /* unmask expects irqs to be disabled */ - xen_set_affinity_evtchn(desc, selected_cpu); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - static long evtchn_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -508,10 +478,8 @@ static long evtchn_ioctl(struct file *file, break; rc = evtchn_bind_to_user(u, bind_interdomain.local_port); - if (rc == 0) { + if (rc == 0) rc = bind_interdomain.local_port; - evtchn_bind_interdom_next_vcpu(rc); - } break; } From 1ca1b4e2c0cbc88ce3939910ac36dca51d326fe4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:02 +0100 Subject: [PATCH 087/326] xen/events: Use immediate affinity setting There is absolutely no reason to mimic the x86 deferred affinity setting. This mechanism is required to handle the hardware induced issues of IO/APIC and MSI and is not in use when the interrupts are remapped. XEN does not need this and can simply change the affinity from the calling context. The core code invokes this with the interrupt descriptor lock held so it is fully serialized against any other operation. Mark the interrupts with IRQ_MOVE_PCNTXT to disable the deferred affinity setting. The conditional mask/unmask operation is already handled in xen_rebind_evtchn_to_cpu(). This makes XEN on x86 use the same mechanics as on e.g. ARM64 where deferred affinity setting is not required and not implemented and the code path in the ack functions is compiled out. Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194045.157601122@linutronix.de --- drivers/xen/events/events_base.c | 35 ++++++++------------------------ 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 9cade1994785..eaba42a89c8f 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -628,6 +628,11 @@ static void xen_irq_init(unsigned irq) info->refcnt = -1; set_info_for_irq(irq, info); + /* + * Interrupt affinity setting can be immediate. No point + * in delaying it until an interrupt is handled. + */ + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); INIT_LIST_HEAD(&info->eoi_list); list_add_tail(&info->list, &xen_irq_list_head); @@ -739,18 +744,7 @@ static void eoi_pirq(struct irq_data *data) if (!VALID_EVTCHN(evtchn)) return; - if (unlikely(irqd_is_setaffinity_pending(data)) && - likely(!irqd_irq_disabled(data))) { - int masked = test_and_set_mask(evtchn); - - clear_evtchn(evtchn); - - irq_move_masked_irq(data); - - if (!masked) - unmask_evtchn(evtchn); - } else - clear_evtchn(evtchn); + clear_evtchn(evtchn); if (pirq_needs_eoi(data->irq)) { rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); @@ -1641,7 +1635,6 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) mutex_unlock(&irq_mapping_update_lock); bind_evtchn_to_cpu(evtchn, info->cpu); - /* This will be deferred until interrupt is processed */ irq_set_affinity(irq, cpumask_of(info->cpu)); /* Unmask the event channel. */ @@ -1688,8 +1681,9 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); - int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu); + int ret; + ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu); if (!ret) irq_data_update_effective_affinity(data, cpumask_of(tcpu)); @@ -1719,18 +1713,7 @@ static void ack_dynirq(struct irq_data *data) if (!VALID_EVTCHN(evtchn)) return; - if (unlikely(irqd_is_setaffinity_pending(data)) && - likely(!irqd_irq_disabled(data))) { - int masked = test_and_set_mask(evtchn); - - clear_evtchn(evtchn); - - irq_move_masked_irq(data); - - if (!masked) - unmask_evtchn(evtchn); - } else - clear_evtchn(evtchn); + clear_evtchn(evtchn); } static void mask_ack_dynirq(struct irq_data *data) From f7a6f994b4f0ee69c656dda3da11431d92d6b08f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:03 +0100 Subject: [PATCH 088/326] xen/events: Only force affinity mask for percpu interrupts All event channel setups bind the interrupt on CPU0 or the target CPU for percpu interrupts and overwrite the affinity mask with the corresponding cpumask. That does not make sense. The XEN implementation of irqchip::irq_set_affinity() already picks a single target CPU out of the affinity mask and the actual target is stored in the effective CPU mask, so destroying the user chosen affinity mask which might contain more than one CPU is wrong. Change the implementation so that the channel is bound to CPU0 at the XEN level and leave the affinity mask alone. At startup of the interrupt affinity will be assigned out of the affinity mask and the XEN binding will be updated. Only keep the enforcement for real percpu interrupts. On resume the overwrite is not required either because info->cpu and the affinity mask are still the same as at the time of suspend. Same for rebind_evtchn_irq(). This also prepares for proper interrupt spreading. Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194045.250321315@linutronix.de --- drivers/xen/events/events_base.c | 48 ++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index eaba42a89c8f..679b2cbd2de4 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -433,15 +433,20 @@ static bool pirq_needs_eoi_flag(unsigned irq) return info->u.pirq.flags & PIRQ_NEEDS_EOI; } -static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu) +static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, + bool force_affinity) { int irq = get_evtchn_to_irq(evtchn); struct irq_info *info = info_for_irq(irq); BUG_ON(irq == -1); -#ifdef CONFIG_SMP - cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu)); -#endif + + if (IS_ENABLED(CONFIG_SMP) && force_affinity) { + cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu)); + cpumask_copy(irq_get_effective_affinity_mask(irq), + cpumask_of(cpu)); + } + xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu); info->cpu = cpu; @@ -788,7 +793,7 @@ static unsigned int __startup_pirq(unsigned int irq) goto err; info->evtchn = evtchn; - bind_evtchn_to_cpu(evtchn, 0); + bind_evtchn_to_cpu(evtchn, 0, false); rc = xen_evtchn_port_setup(evtchn); if (rc) @@ -1107,8 +1112,14 @@ static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip) irq = ret; goto out; } - /* New interdomain events are bound to VCPU 0. */ - bind_evtchn_to_cpu(evtchn, 0); + /* + * New interdomain events are initially bound to vCPU0 This + * is required to setup the event channel in the first + * place and also important for UP guests because the + * affinity setting is not invoked on them so nothing would + * bind the channel. + */ + bind_evtchn_to_cpu(evtchn, 0, false); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_EVTCHN); @@ -1156,7 +1167,11 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) irq = ret; goto out; } - bind_evtchn_to_cpu(evtchn, cpu); + /* + * Force the affinity mask to the target CPU so proc shows + * the correct target. + */ + bind_evtchn_to_cpu(evtchn, cpu, true); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_IPI); @@ -1269,7 +1284,11 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) goto out; } - bind_evtchn_to_cpu(evtchn, cpu); + /* + * Force the affinity mask for percpu interrupts so proc + * shows the correct target. + */ + bind_evtchn_to_cpu(evtchn, cpu, percpu); } else { struct irq_info *info = info_for_irq(irq); WARN_ON(info == NULL || info->type != IRQT_VIRQ); @@ -1634,8 +1653,7 @@ void rebind_evtchn_irq(evtchn_port_t evtchn, int irq) mutex_unlock(&irq_mapping_update_lock); - bind_evtchn_to_cpu(evtchn, info->cpu); - irq_set_affinity(irq, cpumask_of(info->cpu)); + bind_evtchn_to_cpu(evtchn, info->cpu, false); /* Unmask the event channel. */ enable_irq(irq); @@ -1669,7 +1687,7 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu) * it, but don't do the xenlinux-level rebind in that case. */ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); + bind_evtchn_to_cpu(evtchn, tcpu, false); if (!masked) unmask_evtchn(evtchn); @@ -1798,7 +1816,8 @@ static void restore_cpu_virqs(unsigned int cpu) /* Record the new mapping. */ (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); - bind_evtchn_to_cpu(evtchn, cpu); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(evtchn, cpu, false); } } @@ -1823,7 +1842,8 @@ static void restore_cpu_ipis(unsigned int cpu) /* Record the new mapping. */ (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); - bind_evtchn_to_cpu(evtchn, cpu); + /* The affinity mask is still valid */ + bind_evtchn_to_cpu(evtchn, cpu, false); } } From 62ebcda8a8dfa4aeaa3288020a082787910afebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:04 +0100 Subject: [PATCH 089/326] xen/events: Reduce irq_info:: Spurious_cnt storage size To prepare for interrupt spreading reduce the storage size of irq_info::spurious_cnt to u8 so the required flag for the spreading logic will not increase the storage size. Protect the usage site against overruns. Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194045.360198201@linutronix.de --- drivers/xen/events/events_base.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 679b2cbd2de4..b352440963ee 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -95,7 +95,7 @@ struct irq_info { struct list_head list; struct list_head eoi_list; short refcnt; - short spurious_cnt; + u8 spurious_cnt; enum xen_irq_type type; /* type */ unsigned irq; evtchn_port_t evtchn; /* event channel */ @@ -528,8 +528,10 @@ static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) return; if (spurious) { - if ((1 << info->spurious_cnt) < (HZ << 2)) - info->spurious_cnt++; + if ((1 << info->spurious_cnt) < (HZ << 2)) { + if (info->spurious_cnt != 0xFF) + info->spurious_cnt++; + } if (info->spurious_cnt > 1) { delay = 1 << (info->spurious_cnt - 2); if (delay > HZ) From 88f0a9d066443118261adf7e049781476f09dac1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:05 +0100 Subject: [PATCH 090/326] xen/events: Implement irq distribution Keep track of the assignments of event channels to CPUs and select the online CPU with the least assigned channels in the affinity mask which is handed to irq_chip::irq_set_affinity() from the core code. Signed-off-by: Thomas Gleixner Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Stefano Stabellini Link: https://lore.kernel.org/r/20201210194045.457218278@linutronix.de --- drivers/xen/events/events_base.c | 76 ++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index b352440963ee..a8030332a191 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -96,6 +96,7 @@ struct irq_info { struct list_head eoi_list; short refcnt; u8 spurious_cnt; + u8 is_accounted; enum xen_irq_type type; /* type */ unsigned irq; evtchn_port_t evtchn; /* event channel */ @@ -161,6 +162,9 @@ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; +/* Event channel distribution data */ +static atomic_t channels_on_cpu[NR_CPUS]; + static int **evtchn_to_irq; #ifdef CONFIG_X86 static unsigned long *pirq_eoi_map; @@ -257,6 +261,32 @@ static void set_info_for_irq(unsigned int irq, struct irq_info *info) irq_set_chip_data(irq, info); } +/* Per CPU channel accounting */ +static void channels_on_cpu_dec(struct irq_info *info) +{ + if (!info->is_accounted) + return; + + info->is_accounted = 0; + + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1 , 0)); +} + +static void channels_on_cpu_inc(struct irq_info *info) +{ + if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids)) + return; + + if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1, + INT_MAX))) + return; + + info->is_accounted = 1; +} + /* Constructors for packed IRQ information. */ static int xen_irq_info_common_setup(struct irq_info *info, unsigned irq, @@ -339,6 +369,7 @@ static void xen_irq_info_cleanup(struct irq_info *info) { set_evtchn_to_irq(info->evtchn, -1); info->evtchn = 0; + channels_on_cpu_dec(info); } /* @@ -449,7 +480,9 @@ static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu, xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu); + channels_on_cpu_dec(info); info->cpu = cpu; + channels_on_cpu_inc(info); } /** @@ -622,11 +655,6 @@ static void xen_irq_init(unsigned irq) { struct irq_info *info; -#ifdef CONFIG_SMP - /* By default all event channels notify CPU#0. */ - cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0)); -#endif - info = kzalloc(sizeof(*info), GFP_KERNEL); if (info == NULL) panic("Unable to allocate metadata for IRQ%d\n", irq); @@ -1697,10 +1725,38 @@ static int xen_rebind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int tcpu) return 0; } +/* + * Find the CPU within @dest mask which has the least number of channels + * assigned. This is not precise as the per cpu counts can be modified + * concurrently. + */ +static unsigned int select_target_cpu(const struct cpumask *dest) +{ + unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX; + + for_each_cpu_and(cpu, dest, cpu_online_mask) { + unsigned int curch = atomic_read(&channels_on_cpu[cpu]); + + if (curch < minch) { + minch = curch; + best_cpu = cpu; + } + } + + /* + * Catch the unlikely case that dest contains no online CPUs. Can't + * recurse. + */ + if (best_cpu == UINT_MAX) + return select_target_cpu(cpu_online_mask); + + return best_cpu; +} + static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, bool force) { - unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); + unsigned int tcpu = select_target_cpu(dest); int ret; ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu); @@ -1928,8 +1984,12 @@ void xen_irq_resume(void) xen_evtchn_resume(); /* No IRQ <-> event-channel mappings. */ - list_for_each_entry(info, &xen_irq_list_head, list) - info->evtchn = 0; /* zap event-channel binding */ + list_for_each_entry(info, &xen_irq_list_head, list) { + /* Zap event-channel binding */ + info->evtchn = 0; + /* Adjust accounting */ + channels_on_cpu_dec(info); + } clear_evtchn_to_irq_all(); From 64a1b95bb9fe3ec76e1a2cd803eff06389341ae4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:06 +0100 Subject: [PATCH 091/326] genirq: Restrict export of irq_to_desc() No more (ab)use in drivers finally. There is still the modular build of PPC/KVM which needs it, so restrict it to this case which still makes it unavailable for most drivers. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194045.551428291@linutronix.de --- kernel/irq/irqdesc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f509c4db2029..3d0bc38a0bcf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -352,7 +352,9 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -EXPORT_SYMBOL(irq_to_desc); +#ifdef CONFIG_KVM_BOOK3S_64_HV +EXPORT_SYMBOL_GPL(irq_to_desc); +#endif static void delete_irq_desc(unsigned int irq) { From b784c77075023e1a71bc06e6b4f711acb99e9c73 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Tue, 15 Dec 2020 13:24:59 +0100 Subject: [PATCH 092/326] coccinnelle: Remove ptr_ret script The ptr_ret script script addresses a number of situations where we end up testing an error pointer, and if it's an error returning it, or return 0 otherwise to transform it into a PTR_ERR_OR_ZERO call. So it will convert a block like this: if (IS_ERR(err)) return PTR_ERR(err); return 0; into return PTR_ERR_OR_ZERO(err); While this is technically correct, it has a number of drawbacks. First, it merges the error and success path, which will make it harder for a reviewer or reader to grasp. It's also more difficult to extend if we were to add some code between the error check and the function return, making the author essentially revert that patch before adding new lines, while it would have been a trivial addition otherwise for the rewiever. Therefore, since that script is only about cosmetic in the first place, let's remove it since it's not worth it. Acked-by: Jani Nikula Acked-by: Thierry Reding Acked-by: Julia Lawall Reviewed-by: Wolfram Sang Reviewed-by: Mark Brown Signed-off-by: Maxime Ripard Signed-off-by: Julia Lawall --- scripts/coccinelle/api/ptr_ret.cocci | 97 ---------------------------- 1 file changed, 97 deletions(-) delete mode 100644 scripts/coccinelle/api/ptr_ret.cocci diff --git a/scripts/coccinelle/api/ptr_ret.cocci b/scripts/coccinelle/api/ptr_ret.cocci deleted file mode 100644 index e76cd5d90a8a..000000000000 --- a/scripts/coccinelle/api/ptr_ret.cocci +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/// -/// Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR -/// -// Confidence: High -// Copyright: (C) 2012 Julia Lawall, INRIA/LIP6. -// Copyright: (C) 2012 Gilles Muller, INRIA/LiP6. -// URL: http://coccinelle.lip6.fr/ -// Options: --no-includes --include-headers -// -// Keywords: ERR_PTR, PTR_ERR, PTR_ERR_OR_ZERO -// Version min: 2.6.39 -// - -virtual context -virtual patch -virtual org -virtual report - -@depends on patch@ -expression ptr; -@@ - -- if (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; -+ return PTR_ERR_OR_ZERO(ptr); - -@depends on patch@ -expression ptr; -@@ - -- if (IS_ERR(ptr)) return PTR_ERR(ptr); return 0; -+ return PTR_ERR_OR_ZERO(ptr); - -@depends on patch@ -expression ptr; -@@ - -- (IS_ERR(ptr) ? PTR_ERR(ptr) : 0) -+ PTR_ERR_OR_ZERO(ptr) - -@r1 depends on !patch@ -expression ptr; -position p1; -@@ - -* if@p1 (IS_ERR(ptr)) return PTR_ERR(ptr); else return 0; - -@r2 depends on !patch@ -expression ptr; -position p2; -@@ - -* if@p2 (IS_ERR(ptr)) return PTR_ERR(ptr); return 0; - -@r3 depends on !patch@ -expression ptr; -position p3; -@@ - -* IS_ERR@p3(ptr) ? PTR_ERR(ptr) : 0 - -@script:python depends on org@ -p << r1.p1; -@@ - -coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") - - -@script:python depends on org@ -p << r2.p2; -@@ - -coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") - -@script:python depends on org@ -p << r3.p3; -@@ - -coccilib.org.print_todo(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") - -@script:python depends on report@ -p << r1.p1; -@@ - -coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") - -@script:python depends on report@ -p << r2.p2; -@@ - -coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") - -@script:python depends on report@ -p << r3.p3; -@@ - -coccilib.report.print_report(p[0], "WARNING: PTR_ERR_OR_ZERO can be used") From 5ae96d779ff3eeb2977919ff311a6c8849943c2d Mon Sep 17 00:00:00 2001 From: Enrico Weigelt Date: Tue, 15 Dec 2020 17:35:31 +0100 Subject: [PATCH 093/326] libnvdimm: Cleanup include of badblocks.h * drivers/nvdimm/core.c doesn't use anything from badblocks.h on its own, thus including it isn't needed. There's indeed indirect use, via funcs in nd.h, but this one already includes badblocks.h. * drivers/nvdimm/claim.c calls stuff from badblocks.h and therefore should include it on its own (instead of relying any other header doing that) * drivers/nvdimm/btt.h doesn't really need anything from badblocks.h and can easily live with a forward declaration of struct badblocks (just having pointers to it, but not dereferencing it anywhere) Signed-off-by: Enrico Weigelt Link: https://lore.kernel.org/r/20201215163531.21446-1-info@metux.net Signed-off-by: Dan Williams --- drivers/nvdimm/btt.h | 3 ++- drivers/nvdimm/claim.c | 1 + drivers/nvdimm/core.c | 1 - 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index 2e258bee7db2..aa53e0b769bd 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -7,7 +7,6 @@ #ifndef _LINUX_BTT_H #define _LINUX_BTT_H -#include #include #define BTT_SIG_LEN 16 @@ -197,6 +196,8 @@ struct arena_info { int log_index[2]; }; +struct badblocks; + /** * struct btt - handle for a BTT instance * @btt_disk: Pointer to the gendisk for BTT device diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 5a7c80053c62..030dbde6b088 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -4,6 +4,7 @@ */ #include #include +#include #include "nd-core.h" #include "pmem.h" #include "pfn.h" diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index c21ba0602029..7de592d7eff4 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -3,7 +3,6 @@ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. */ #include -#include #include #include #include From 4e6a7b3bbd5a6f9e6f0c5c3ad976ed116c7ade79 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 14 Dec 2020 21:45:06 +0800 Subject: [PATCH 094/326] device-dax/pmem: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Link: https://lore.kernel.org/r/20201214134506.4831-1-zhengyongjun3@huawei.com Signed-off-by: Dan Williams --- drivers/dax/pmem/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/pmem/core.c b/drivers/dax/pmem/core.c index 62b26bfceab1..062e8bc14223 100644 --- a/drivers/dax/pmem/core.c +++ b/drivers/dax/pmem/core.c @@ -52,7 +52,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys) /* adjust the dax_region range to the start of data */ range = pgmap.range; - range.start += offset, + range.start += offset; dax_region = alloc_dax_region(dev, region_id, &range, nd_region->target_node, le32_to_cpu(pfn_sb->align), IORESOURCE_DAX_STATIC); From 1aa574312518ef1d60d2dc62d58f7021db3b163a Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 1 Dec 2020 21:59:29 +0800 Subject: [PATCH 095/326] device-dax/core: Fix memory leak when rmmod dax.ko When I repeatedly modprobe and rmmod dax.ko, kmemleak report a memory leak as follows: unreferenced object 0xffff9a5588c05088 (size 8): comm "modprobe", pid 261, jiffies 4294693644 (age 42.063s) ... backtrace: [<00000000e007ced0>] kstrdup+0x35/0x70 [<000000002ae73897>] kstrdup_const+0x3d/0x50 [<000000002b00c9c3>] kvasprintf_const+0xbc/0xf0 [<000000008023282f>] kobject_set_name_vargs+0x3b/0xd0 [<00000000d2cbaa4e>] kobject_set_name+0x62/0x90 [<00000000202e7a22>] bus_register+0x7f/0x2b0 [<000000000b77792c>] 0xffffffffc02840f7 [<000000002d5be5ac>] 0xffffffffc02840b4 [<00000000dcafb7cd>] do_one_initcall+0x58/0x240 [<00000000049fe480>] do_init_module+0x56/0x1e2 [<0000000022671491>] load_module+0x2517/0x2840 [<000000001a2201cb>] __do_sys_finit_module+0x9c/0xe0 [<000000003eb304e7>] do_syscall_64+0x33/0x40 [<0000000051c5fd06>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 When rmmod dax is executed, dax_bus_exit() is missing. This patch can fix this bug. Fixes: 9567da0b408a ("device-dax: Introduce bus + driver model") Cc: Reported-by: Hulk Robot Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20201201135929.66530-1-wanghai38@huawei.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index edc279be3e59..cadbd0a1a1ef 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -752,6 +752,7 @@ err_chrdev: static void __exit dax_core_exit(void) { + dax_bus_exit(); unregister_chrdev_region(dax_devt, MINORMASK+1); ida_destroy(&dax_minor_ida); dax_fs_exit(); From 846f151d03f796bf1b303784edaf3a22e3f51377 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 4 Dec 2020 17:51:52 +0100 Subject: [PATCH 096/326] drm/ttm: fix unused function warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ttm_pool_type_count() is not used when debugfs is disabled: drivers/gpu/drm/ttm/ttm_pool.c:243:21: error: unused function 'ttm_pool_type_count' [-Werror,-Wunused-function] static unsigned int ttm_pool_type_count(struct ttm_pool_type *pt) Move the definition into the #ifdef block. Fixes: d099fc8f540a ("drm/ttm: new TT backend allocation pool v3") Signed-off-by: Arnd Bergmann Reviewed-by: Martin Peres Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/405695/ Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_pool.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 5455b2044759..7b2f60616750 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -239,21 +239,6 @@ static struct page *ttm_pool_type_take(struct ttm_pool_type *pt) return p; } -/* Count the number of pages available in a pool_type */ -static unsigned int ttm_pool_type_count(struct ttm_pool_type *pt) -{ - unsigned int count = 0; - struct page *p; - - spin_lock(&pt->lock); - /* Only used for debugfs, the overhead doesn't matter */ - list_for_each_entry(p, &pt->pages, lru) - ++count; - spin_unlock(&pt->lock); - - return count; -} - /* Initialize and add a pool type to the global shrinker list */ static void ttm_pool_type_init(struct ttm_pool_type *pt, struct ttm_pool *pool, enum ttm_caching caching, unsigned int order) @@ -543,6 +528,20 @@ void ttm_pool_fini(struct ttm_pool *pool) EXPORT_SYMBOL(ttm_pool_fini); #ifdef CONFIG_DEBUG_FS +/* Count the number of pages available in a pool_type */ +static unsigned int ttm_pool_type_count(struct ttm_pool_type *pt) +{ + unsigned int count = 0; + struct page *p; + + spin_lock(&pt->lock); + /* Only used for debugfs, the overhead doesn't matter */ + list_for_each_entry(p, &pt->pages, lru) + ++count; + spin_unlock(&pt->lock); + + return count; +} /* Dump information about the different pool types */ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, From 34cdf405aa5de827b8bef79a6c82c39120b3729b Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Wed, 16 Dec 2020 20:52:00 +0800 Subject: [PATCH 097/326] ALSA: hda/realtek: Remove dummy lineout on Acer TravelMate P648/P658 Acer TravelMate laptops P648/P658 series with codec ALC282 only have one physical jack for headset but there's a confusing lineout pin on NID 0x1b reported. Audio applications hence misunderstand that there are a speaker and a lineout, and take the lineout as the default audio output. Add a new quirk to remove the useless lineout and enable the pin 0x18 for jack sensing and headset microphone. Signed-off-by: Chris Chiu Signed-off-by: Jian-Hong Pan Cc: Link: https://lore.kernel.org/r/20201216125200.27053-1-chiu@endlessos.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 41cc64036f22..71da03f950cc 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6368,6 +6368,7 @@ enum { ALC287_FIXUP_HP_GPIO_LED, ALC256_FIXUP_HP_HEADSET_MIC, ALC236_FIXUP_DELL_AIO_HEADSET_MIC, + ALC282_FIXUP_ACER_DISABLE_LINEOUT, }; static const struct hda_fixup alc269_fixups[] = { @@ -7791,6 +7792,16 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE }, + [ALC282_FIXUP_ACER_DISABLE_LINEOUT] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x1b, 0x411111f0 }, + { 0x18, 0x01a1913c }, /* use as headset mic, without its own jack detect */ + { }, + }, + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8564,6 +8575,22 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60140}, {0x19, 0x04a11030}, {0x21, 0x04211020}), + SND_HDA_PIN_QUIRK(0x10ec0282, 0x1025, "Acer", ALC282_FIXUP_ACER_DISABLE_LINEOUT, + ALC282_STANDARD_PINS, + {0x12, 0x90a609c0}, + {0x18, 0x03a11830}, + {0x19, 0x04a19831}, + {0x1a, 0x0481303f}, + {0x1b, 0x04211020}, + {0x21, 0x0321101f}), + SND_HDA_PIN_QUIRK(0x10ec0282, 0x1025, "Acer", ALC282_FIXUP_ACER_DISABLE_LINEOUT, + ALC282_STANDARD_PINS, + {0x12, 0x90a60940}, + {0x18, 0x03a11830}, + {0x19, 0x04a19831}, + {0x1a, 0x0481303f}, + {0x1b, 0x04211020}, + {0x21, 0x0321101f}), SND_HDA_PIN_QUIRK(0x10ec0283, 0x1028, "Dell", ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, ALC282_STANDARD_PINS, {0x12, 0x90a60130}, From 8075c3005e4b1efa12dbbf6e84bc412a713de92c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 16 Dec 2020 00:49:31 +0000 Subject: [PATCH 098/326] dma-buf: cma_heap: Include linux/vmalloc.h to fix build failures on MIPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to include in order for MIPS to find vmap(), as it doesn't otherwise get included there. Without this patch, one can hit the following build error: drivers/dma-buf/heaps/cma_heap.c: In function 'cma_heap_do_vmap': drivers/dma-buf/heaps/cma_heap.c:195:10: error: implicit declaration of function 'vmap' Cc: Sumit Semwal Cc: Liam Mark Cc: Laura Abbott Cc: Brian Starkey Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Daniel Mentz Cc: Chris Goldsworthy Cc: Ørjan Eide Cc: Robin Murphy Cc: Ezequiel Garcia Cc: Simon Ser Cc: James Jones Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Fixes: a5d2d29e24be ("dma-buf: heaps: Move heap-helper logic into the cma_heap implementation") Reported-by: Guenter Roeck Signed-off-by: John Stultz Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20201216004931.113505-1-john.stultz@linaro.org --- drivers/dma-buf/heaps/cma_heap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c index 5e7c3436310c..3c4e34301172 100644 --- a/drivers/dma-buf/heaps/cma_heap.c +++ b/drivers/dma-buf/heaps/cma_heap.c @@ -20,6 +20,7 @@ #include #include #include +#include struct cma_heap { From e6582cb5dab4ae572513412cc10fd0ffe07e0b05 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 30 Nov 2020 11:19:21 +0100 Subject: [PATCH 099/326] blk-mq: Remove 'running from the wrong CPU' warning It's guaranteed that no request is in flight when a hctx is going offline. This warning is only triggered when the wq's CPU is hot plugged and the blk-mq is not synced up yet. As this state is temporary and the request is still processed correctly, better remove the warning as this is the fast path. Suggested-by: Ming Lei Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b09ce00cc6af..a428798e1c5c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1495,31 +1495,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; - /* - * We should be running this queue from one of the CPUs that - * are mapped to it. - * - * There are at least two related races now between setting - * hctx->next_cpu from blk_mq_hctx_next_cpu() and running - * __blk_mq_run_hw_queue(): - * - * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), - * but later it becomes online, then this warning is harmless - * at all - * - * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), - * but later it becomes offline, then the warning can't be - * triggered, and we depend on blk-mq timeout handler to - * handle dispatched requests to this hctx - */ - if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)) { - printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", - raw_smp_processor_id(), - cpumask_empty(hctx->cpumask) ? "inactive": "active"); - dump_stack(); - } - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early. From e7508d48565060af5d89f10cb83c9359c8ae1310 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 10 Dec 2020 11:18:20 +0100 Subject: [PATCH 100/326] block/rnbd-clt: Get rid of warning regarding size argument in strlcpy The kernel test robot triggerred the following warning, >> drivers/block/rnbd/rnbd-clt.c:1397:42: warning: size argument in 'strlcpy' call appears to be size of the source; expected the size of the destination [-Wstrlcpy-strlcat-size] strlcpy(dev->pathname, pathname, strlen(pathname) + 1); ~~~~~~~^~~~~~~~~~~~~ To get rid of the above warning, use a kstrdup as Bart suggested. Fixes: 64e8a6ece1a5 ("block/rnbd-clt: Dynamically alloc buffer for pathname & blk_symlink_name") Reported-by: kernel test robot Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index a199b190c73d..d63f0974bd04 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1388,12 +1388,11 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, goto out_queues; } - dev->pathname = kzalloc(strlen(pathname) + 1, GFP_KERNEL); + dev->pathname = kstrdup(pathname, GFP_KERNEL); if (!dev->pathname) { ret = -ENOMEM; goto out_queues; } - strlcpy(dev->pathname, pathname, strlen(pathname) + 1); dev->clt_device_id = ret; dev->sess = sess; From 46067844efdb8275ade705923120fc5391543b53 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Thu, 10 Dec 2020 11:18:21 +0100 Subject: [PATCH 101/326] block/rnbd-clt: Fix possible memleak In error case, we do not free the memory for blk_symlink_name. Do it by free the memory in error case, and set to NULL afterwards. Also fix the condition in rnbd_clt_remove_dev_symlink. Fixes: 64e8a6ece1a5 ("block/rnbd-clt: Dynamically alloc buffer for pathname & blk_symlink_name") Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index a7caeedeb198..d4aa6bfc9555 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -432,7 +432,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because * of sysfs link already was removed already. */ - if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { + if (dev->blk_symlink_name && try_module_get(THIS_MODULE)) { sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); kfree(dev->blk_symlink_name); module_put(THIS_MODULE); @@ -521,7 +521,8 @@ static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) return 0; out_err: - dev->blk_symlink_name[0] = '\0'; + kfree(dev->blk_symlink_name); + dev->blk_symlink_name = NULL ; return ret; } From 87019e7d99d707e60e20ea3245a561419d5de5ce Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 10 Dec 2020 11:18:22 +0100 Subject: [PATCH 102/326] block/rnbd-srv: Protect dev session sysfs removal Since the removal of the session sysfs can also be called from the function destroy_sess, there is a need to protect the call from the function rnbd_srv_sess_dev_force_close Fixes: 786998050cbc ("block/rnbd-srv: close a mapped device from server side.") Signed-off-by: Md Haris Iqbal Reviewed-by: Guoqing Jiang Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index d1ee72ed8384..066411cce5e2 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -338,9 +338,10 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) { + mutex_lock(&sess_dev->sess->lock); rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&sess_dev->sess->lock); sess_dev->keep_id = true; - } static int process_msg_close(struct rtrs_srv *rtrs, From 3877ece01e46f01fae0fbc00df93d0e5f23196b0 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Thu, 10 Dec 2020 11:18:23 +0100 Subject: [PATCH 103/326] block/rnbd: Fix typos Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d63f0974bd04..3a2e6e8ed6b1 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -359,7 +359,7 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, * 2nd reference is dropped after confirmation with the response is * returned. * 1st and 2nd can happen in any order, so the rnbd_iu should be - * released (rtrs_permit returned to ibbtrs) only leased after both + * released (rtrs_permit returned to rtrs) only after both * are finished. */ atomic_set(&iu->refcount, 2); @@ -803,7 +803,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname) rnbd_init_cpu_qlists(sess->cpu_queues); /* - * That is simple percpu variable which stores cpu indeces, which are + * That is simple percpu variable which stores cpu indices, which are * incremented on each access. We need that for the sake of fairness * to wake up queues in a round-robin manner. */ @@ -1666,7 +1666,7 @@ static void rnbd_destroy_sessions(void) /* * Here at this point there is no any concurrent access to sessions * list and devices list: - * 1. New session or device can'be be created - session sysfs files + * 1. New session or device can't be created - session sysfs files * are removed. * 2. Device or session can't be removed - module reference is taken * into account in unmap device sysfs callback. From 512c781fd28cb401ee9f2843e32bf4640732c671 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 10 Dec 2020 11:18:24 +0100 Subject: [PATCH 104/326] block/rnbd: Set write-back cache and fua same to the target device The rnbd-client always sets the write-back cache and fua attributes of the rnbd device queue regardless of the target device on the server. That generates IO hang issue when the target device does not support both of write-back cacne and fua. This patch adds more fields for the cache policy and fua into the device opening message. The rnbd-server sends the information if the target device supports the write-back cache and fua and rnbd-client recevives it and set the device queue accordingly. Signed-off-by: Gioh Kim [jwang: some minor change, rename a few varables, remove unrelated comments.] Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 8 +++++--- drivers/block/rnbd/rnbd-clt.h | 2 ++ drivers/block/rnbd/rnbd-proto.h | 9 ++++++++- drivers/block/rnbd/rnbd-srv.c | 9 +++++++-- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 3a2e6e8ed6b1..b5fffbdeb263 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -88,6 +88,8 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); dev->secure_discard = le16_to_cpu(rsp->secure_discard); dev->rotational = rsp->rotational; + dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); + dev->fua = !!(rsp->cache_policy & RNBD_FUA); dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; dev->max_segments = BMAX_SEGMENTS; @@ -1305,7 +1307,7 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) blk_queue_max_segments(dev->queue, dev->max_segments); blk_queue_io_opt(dev->queue, dev->sess->max_io_size); blk_queue_virt_boundary(dev->queue, SZ_4K - 1); - blk_queue_write_cache(dev->queue, true, true); + blk_queue_write_cache(dev->queue, dev->wc, dev->fua); dev->queue->queuedata = dev; } @@ -1528,13 +1530,13 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, } rnbd_clt_info(dev, - "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d)\n", + "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d, wc: %d, fua: %d)\n", dev->gd->disk_name, dev->nsectors, dev->logical_block_size, dev->physical_block_size, dev->max_write_same_sectors, dev->max_discard_sectors, dev->discard_granularity, dev->discard_alignment, dev->secure_discard, dev->max_segments, - dev->max_hw_sectors, dev->rotational); + dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua); mutex_unlock(&dev->lock); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index b193d5904050..efd67ae286ca 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -112,6 +112,8 @@ struct rnbd_clt_dev { enum rnbd_access_mode access_mode; bool read_only; bool rotational; + bool wc; + bool fua; u32 max_hw_sectors; u32 max_write_same_sectors; u32 max_discard_sectors; diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index ca166241452c..c1bc5c0fef71 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -108,6 +108,11 @@ struct rnbd_msg_close { __le32 device_id; }; +enum rnbd_cache_policy { + RNBD_FUA = 1 << 0, + RNBD_WRITEBACK = 1 << 1, +}; + /** * struct rnbd_msg_open_rsp - response message to RNBD_MSG_OPEN * @hdr: message header @@ -124,6 +129,7 @@ struct rnbd_msg_close { * @max_segments: max segments hardware support in one transfer * @secure_discard: supports secure discard * @rotation: is a rotational disc? + * @cache_policy: support write-back caching or FUA? */ struct rnbd_msg_open_rsp { struct rnbd_msg_hdr hdr; @@ -139,7 +145,8 @@ struct rnbd_msg_open_rsp { __le16 max_segments; __le16 secure_discard; u8 rotational; - u8 reserved[11]; + u8 cache_policy; + u8 reserved[10]; }; /** diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 066411cce5e2..b8e44331e494 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -550,6 +550,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, struct rnbd_srv_sess_dev *sess_dev) { struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; + struct request_queue *q = bdev_get_queue(rnbd_dev->bdev); rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = @@ -574,8 +575,12 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); rsp->secure_discard = cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); - rsp->rotational = - !blk_queue_nonrot(bdev_get_queue(rnbd_dev->bdev)); + rsp->rotational = !blk_queue_nonrot(q); + rsp->cache_policy = 0; + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + rsp->cache_policy |= RNBD_WRITEBACK; + if (blk_queue_fua(q)) + rsp->cache_policy |= RNBD_FUA; } static struct rnbd_srv_sess_dev * From 5a1328d0c3a757cdd8c65f4dfe0a02502a5810bc Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 10 Dec 2020 11:18:25 +0100 Subject: [PATCH 105/326] block/rnbd-clt: Dynamically allocate sglist for rnbd_iu The BMAX_SEGMENT static array for scatterlist is embedded in rnbd_iu structure to avoid memory allocation in hot IO path. In many cases, we do need only several sg entries because many IOs have only several segments. This patch change rnbd_iu to check the number of segments in the request and allocate sglist dynamically. For io path, use sg_alloc_table_chained to allocate sg list faster. First it makes two sg entries after pdu of request. The sg_alloc_table_chained uses the pre-allocated sg entries if the number of segments of the request is less than two. So it reduces the number of memory allocation. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 60 ++++++++++++++++++++--------------- drivers/block/rnbd/rnbd-clt.h | 10 +++++- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index b5fffbdeb263..5941ff7c83a8 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -384,6 +384,7 @@ static void rnbd_softirq_done_fn(struct request *rq) struct rnbd_iu *iu; iu = blk_mq_rq_to_pdu(rq); + sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT); rnbd_put_permit(sess, iu->permit); blk_mq_end_request(rq, errno_to_blk_status(iu->errno)); } @@ -477,7 +478,7 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) iu->buf = NULL; iu->dev = dev; - sg_mark_end(&iu->sglist[0]); + sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); msg.hdr.type = cpu_to_le16(RNBD_MSG_CLOSE); msg.device_id = cpu_to_le32(device_id); @@ -492,6 +493,7 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) err = errno; } + sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } @@ -564,7 +566,8 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) iu->buf = rsp; iu->dev = dev; - sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); + sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); msg.access_mode = dev->access_mode; @@ -572,7 +575,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) WARN_ON(!rnbd_clt_get_dev(dev)); err = send_usr_msg(sess->rtrs, READ, iu, - &vec, sizeof(*rsp), iu->sglist, 1, + &vec, sizeof(*rsp), iu->sgt.sgl, 1, msg_open_conf, &errno, wait); if (err) { rnbd_clt_put_dev(dev); @@ -582,6 +585,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) err = errno; } + sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } @@ -610,7 +614,8 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) iu->buf = rsp; iu->sess = sess; - sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); + sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO); msg.ver = RNBD_PROTO_VER_MAJOR; @@ -626,7 +631,7 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) goto put_iu; } err = send_usr_msg(sess->rtrs, READ, iu, - &vec, sizeof(*rsp), iu->sglist, 1, + &vec, sizeof(*rsp), iu->sgt.sgl, 1, msg_sess_info_conf, &errno, wait); if (err) { rnbd_clt_put_sess(sess); @@ -636,7 +641,7 @@ put_iu: } else { err = errno; } - + sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } @@ -1016,11 +1021,10 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, * See queue limits. */ if (req_op(rq) != REQ_OP_DISCARD) - sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist); + sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl); if (sg_cnt == 0) - /* Do not forget to mark the end */ - sg_mark_end(&iu->sglist[0]); + sg_mark_end(&iu->sgt.sgl[0]); msg.hdr.type = cpu_to_le16(RNBD_MSG_IO); msg.device_id = cpu_to_le32(dev->device_id); @@ -1029,13 +1033,13 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, .iov_base = &msg, .iov_len = sizeof(msg) }; - size = rnbd_clt_get_sg_size(iu->sglist, sg_cnt); + size = rnbd_clt_get_sg_size(iu->sgt.sgl, sg_cnt); req_ops = (struct rtrs_clt_req_ops) { .priv = iu, .conf_fn = msg_io_conf, }; err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit, - &vec, 1, size, iu->sglist, sg_cnt); + &vec, 1, size, iu->sgt.sgl, sg_cnt); if (unlikely(err)) { rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n", err); @@ -1122,6 +1126,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, struct rnbd_clt_dev *dev = rq->rq_disk->private_data; struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); int err; + blk_status_t ret = BLK_STS_IOERR; if (unlikely(dev->dev_state != DEV_STATE_MAPPED)) return BLK_STS_IOERR; @@ -1133,32 +1138,35 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_RESOURCE; } + iu->sgt.sgl = iu->first_sgl; + err = sg_alloc_table_chained(&iu->sgt, + /* Even-if the request has no segment, + * sglist must have one entry at least */ + blk_rq_nr_phys_segments(rq) ? : 1, + iu->sgt.sgl, + RNBD_INLINE_SG_CNT); + if (err) { + rnbd_clt_err_rl(dev, "sg_alloc_table_chained ret=%d\n", err); + rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/); + rnbd_put_permit(dev->sess, iu->permit); + return BLK_STS_RESOURCE; + } + blk_mq_start_request(rq); err = rnbd_client_xfer_request(dev, rq, iu); if (likely(err == 0)) return BLK_STS_OK; if (unlikely(err == -EAGAIN || err == -ENOMEM)) { rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/); - rnbd_put_permit(dev->sess, iu->permit); - return BLK_STS_RESOURCE; + ret = BLK_STS_RESOURCE; } - + sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT); rnbd_put_permit(dev->sess, iu->permit); - return BLK_STS_IOERR; -} - -static int rnbd_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); - - sg_init_table(iu->sglist, BMAX_SEGMENTS); - return 0; + return ret; } static struct blk_mq_ops rnbd_mq_ops = { .queue_rq = rnbd_queue_rq, - .init_request = rnbd_init_request, .complete = rnbd_softirq_done_fn, }; @@ -1172,7 +1180,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess) tag_set->numa_node = NUMA_NO_NODE; tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_TAG_QUEUE_SHARED; - tag_set->cmd_size = sizeof(struct rnbd_iu); + tag_set->cmd_size = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE; tag_set->nr_hw_queues = num_online_cpus(); return blk_mq_alloc_tag_set(tag_set); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index efd67ae286ca..537d499dad3b 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -44,6 +44,13 @@ struct rnbd_iu_comp { int errno; }; +#ifdef CONFIG_ARCH_NO_SG_CHAIN +#define RNBD_INLINE_SG_CNT 0 +#else +#define RNBD_INLINE_SG_CNT 2 +#endif +#define RNBD_RDMA_SGL_SIZE (sizeof(struct scatterlist) * RNBD_INLINE_SG_CNT) + struct rnbd_iu { union { struct request *rq; /* for block io */ @@ -56,11 +63,12 @@ struct rnbd_iu { /* use to send msg associated with a sess */ struct rnbd_clt_session *sess; }; - struct scatterlist sglist[BMAX_SEGMENTS]; + struct sg_table sgt; struct work_struct work; int errno; struct rnbd_iu_comp comp; atomic_t refcount; + struct scatterlist first_sgl[]; /* must be the last one */ }; struct rnbd_cpu_qlist { From 9aaf9a2aba0c2b5f0fc6dfeb011f0b4c8e224a73 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 10 Dec 2020 11:18:26 +0100 Subject: [PATCH 106/326] block/rnbd-clt: Does not request pdu to rtrs-clt Previously the rnbd client requested the rtrs to allocate rnbd_iu just after the rtrs_iu. So the rnbd client passes the size of rnbd_iu for rtrs_clt_open() and rtrs creates an array of rnbd_iu and rtrs_iu. For IO handling, rnbd_iu exists after the request because we pass the size of rnbd_iu when setting the tag-set. Therefore we do not use the rnbd_iu allocated by rtrs for IO handling. We only use the rnbd_iu allocated by rtrs when doing session initialization. Almost all rnbd_iu allocated by rtrs are wasted. By this patch the rnbd client does not request rnbd_iu allocation to rtrs but allocate it for itself when doing session initialization. Also remove unused rtrs_permit_to_pdu from rtrs. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 17 +++++++++++++---- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 6 ------ drivers/infiniband/ulp/rtrs/rtrs.h | 7 ------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 5941ff7c83a8..96e3f9fe8241 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -349,12 +349,19 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu; struct rtrs_permit *permit; + iu = kzalloc(sizeof(*iu), GFP_KERNEL); + if (!iu) { + return NULL; + } + permit = rnbd_get_permit(sess, con_type, wait ? RTRS_PERMIT_WAIT : RTRS_PERMIT_NOWAIT); - if (unlikely(!permit)) + if (unlikely(!permit)) { + kfree(iu); return NULL; - iu = rtrs_permit_to_pdu(permit); + } + iu->permit = permit; /* * 1st reference is dropped after finishing sending a "user" message, @@ -373,8 +380,10 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) { - if (atomic_dec_and_test(&iu->refcount)) + if (atomic_dec_and_test(&iu->refcount)) { rnbd_put_permit(sess, iu->permit); + kfree(iu); + } } static void rnbd_softirq_done_fn(struct request *rq) @@ -1218,7 +1227,7 @@ find_and_get_or_create_sess(const char *sessname, */ sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, paths, path_cnt, port_nr, - sizeof(struct rnbd_iu), + 0, /* Do not use pdu of rtrs */ RECONNECT_DELAY, BMAX_SEGMENTS, BLK_MAX_SEGMENT_SIZE, MAX_RECONNECTS); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 560865f65dc4..67f86c405a26 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -157,12 +157,6 @@ void rtrs_clt_put_permit(struct rtrs_clt *clt, struct rtrs_permit *permit) } EXPORT_SYMBOL(rtrs_clt_put_permit); -void *rtrs_permit_to_pdu(struct rtrs_permit *permit) -{ - return permit + 1; -} -EXPORT_SYMBOL(rtrs_permit_to_pdu); - /** * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit * @sess: client session pointer diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index 9af750f4d783..8738e90e715a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -63,13 +63,6 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, void rtrs_clt_close(struct rtrs_clt *sess); -/** - * rtrs_permit_to_pdu() - converts rtrs_permit to opaque pdu pointer - * @permit: RTRS permit pointer, it associates the memory allocation for future - * RDMA operation. - */ -void *rtrs_permit_to_pdu(struct rtrs_permit *permit); - enum { RTRS_PERMIT_NOWAIT = 0, RTRS_PERMIT_WAIT = 1, From a146468d76e0462393a3e15b77b8b3ede60e2d06 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Dec 2020 20:57:27 -0700 Subject: [PATCH 107/326] io_uring: break links on shutdown failure Ensure that the return value of __sys_shutdown_sock() is used to potentially break links to the request, if we fail. Fixes: 36f4fa6886a8 ("io_uring: add support for shutdown(2)") Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f9392c35eef..6a4560c9ed9a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3784,6 +3784,8 @@ static int io_shutdown(struct io_kiocb *req, bool force_nonblock) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, req->shutdown.how); + if (ret < 0) + req_set_fail_links(req); io_req_complete(req, ret); return 0; #else From 4c46764733c85b82c07e9559b39da4d00a7dd659 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Sat, 5 Dec 2020 19:50:56 +0800 Subject: [PATCH 108/326] libnvdimm/label: Return -ENXIO for no slot in __blk_label_update Forget to set error code when nd_label_alloc_slot failed, and we add it to avoid overwritten error code. Fixes: 0ba1c634892b ("libnvdimm: write blk label set") Signed-off-by: Zhang Qilong Link: https://lore.kernel.org/r/20201205115056.2076523-1-zhangqilong3@huawei.com Signed-off-by: Dan Williams --- drivers/nvdimm/label.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 6f2be7a34598..9251441fd8a3 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -1008,8 +1008,10 @@ static int __blk_label_update(struct nd_region *nd_region, if (is_old_resource(res, old_res_list, old_num_resources)) continue; /* carry-over */ slot = nd_label_alloc_slot(ndd); - if (slot == UINT_MAX) + if (slot == UINT_MAX) { + rc = -ENXIO; goto abort; + } dev_dbg(ndd->dev, "allocated: %d\n", slot); nd_label = to_label(ndd, slot); From 7948fab26bcc468aa2a76462f441291b5fb0d5c7 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 7 Dec 2020 11:30:05 +0530 Subject: [PATCH 109/326] watchdog: qcom: Avoid context switch in restart handler The use of msleep() in the restart handler will cause scheduler to induce a context switch which is not desirable. This generates below warning on SDX55 when WDT is the only available restart source: [ 39.800188] reboot: Restarting system [ 39.804115] ------------[ cut here ]------------ [ 39.807855] WARNING: CPU: 0 PID: 678 at kernel/rcu/tree_plugin.h:297 rcu_note_context_switch+0x190/0x764 [ 39.812538] Modules linked in: [ 39.821954] CPU: 0 PID: 678 Comm: reboot Not tainted 5.10.0-rc1-00063-g33a9990d1d66-dirty #47 [ 39.824854] Hardware name: Generic DT based system [ 39.833470] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 39.838154] [] (show_stack) from [] (dump_stack+0x8c/0xa0) [ 39.846049] [] (dump_stack) from [] (__warn+0xd8/0xf0) [ 39.853058] [] (__warn) from [] (warn_slowpath_fmt+0x64/0xc8) [ 39.859925] [] (warn_slowpath_fmt) from [] (rcu_note_context_switch+0x190/0x764) [ 39.867503] [] (rcu_note_context_switch) from [] (__schedule+0x84/0x640) [ 39.876685] [] (__schedule) from [] (schedule+0x58/0x10c) [ 39.885095] [] (schedule) from [] (schedule_timeout+0x1e8/0x3d4) [ 39.892135] [] (schedule_timeout) from [] (msleep+0x2c/0x38) [ 39.899947] [] (msleep) from [] (qcom_wdt_restart+0xc4/0xcc) [ 39.907319] [] (qcom_wdt_restart) from [] (watchdog_restart_notifier+0x18/0x28) [ 39.914715] [] (watchdog_restart_notifier) from [] (atomic_notifier_call_chain+0x60/0x84) [ 39.923487] [] (atomic_notifier_call_chain) from [] (machine_restart+0x78/0x7c) [ 39.933551] [] (machine_restart) from [] (__do_sys_reboot+0xdc/0x1e0) [ 39.942397] [] (__do_sys_reboot) from [] (ret_fast_syscall+0x0/0x54) [ 39.950721] Exception stack(0xc3e0bfa8 to 0xc3e0bff0) [ 39.958855] bfa0: 0001221c bed2fe24 fee1dead 28121969 01234567 00000000 [ 39.963832] bfc0: 0001221c bed2fe24 00000003 00000058 000225e0 00000000 00000000 00000000 [ 39.971985] bfe0: b6e62560 bed2fc84 00010fd8 b6e62580 [ 39.980124] ---[ end trace 3f578288bad866e4 ]--- Hence, replace msleep() with mdelay() to fix this issue. Fixes: 05e487d905ab ("watchdog: qcom: register a restart notifier") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201207060005.21293-1-manivannan.sadhasivam@linaro.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/qcom-wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index 07d399c4edc4..7cf0f2ec649b 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -148,7 +148,7 @@ static int qcom_wdt_restart(struct watchdog_device *wdd, unsigned long action, */ wmb(); - msleep(150); + mdelay(150); return 0; } From 8cbd82d62f45423bc337abfcfd51da83fbe60277 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Fri, 11 Dec 2020 01:19:29 +0000 Subject: [PATCH 110/326] dt-bindings: watchdog: sun4i: Add A100 compatible Add a binding for A100's watchdog controller. Signed-off-by: Yangtao Li Acked-by: Rob Herring Signed-off-by: Andre Przywara Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201211011934.6171-17-andre.przywara@arm.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml index e8f226376108..5ac607de8be4 100644 --- a/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/allwinner,sun4i-a10-wdt.yaml @@ -21,6 +21,9 @@ properties: - items: - const: allwinner,sun50i-a64-wdt - const: allwinner,sun6i-a31-wdt + - items: + - const: allwinner,sun50i-a100-wdt + - const: allwinner,sun6i-a31-wdt - items: - const: allwinner,sun50i-h6-wdt - const: allwinner,sun6i-a31-wdt From 36c47df85ee8e1f8a35366ac11324f8875de00eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 3 Dec 2020 23:33:42 +0100 Subject: [PATCH 111/326] watchdog: coh901327: add COMMON_CLK dependency clang produces a build failure in configurations without COMMON_CLK when a timeout calculation goes wrong: arm-linux-gnueabi-ld: drivers/watchdog/coh901327_wdt.o: in function `coh901327_enable': coh901327_wdt.c:(.text+0x50): undefined reference to `__bad_udelay' Add a Kconfig dependency to only do build testing when COMMON_CLK is enabled. Fixes: da2a68b3eb47 ("watchdog: Enable COMPILE_TEST where possible") Signed-off-by: Arnd Bergmann Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201203223358.1269372-1-arnd@kernel.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 5d3c3ec918a8..60f2b9a4a566 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -632,7 +632,7 @@ config SUNXI_WATCHDOG config COH901327_WATCHDOG bool "ST-Ericsson COH 901 327 watchdog" - depends on ARCH_U300 || (ARM && COMPILE_TEST) + depends on ARCH_U300 || (ARM && COMMON_CLK && COMPILE_TEST) default y if MACH_U300 select WATCHDOG_CORE help From e629fffcc333efbda6b7f8cdcf77238533ddf442 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 16 Nov 2020 15:25:39 +0100 Subject: [PATCH 112/326] dt-binding: watchdog: add Rockchip compatibles to snps,dw-wdt.yaml The Rockchip watchdog compatibles below are already in use, but somehow never added to a document, so add them to the snps,dw-wdt.yaml file. "rockchip,rk3066-wdt", "snps,dw-wdt" "rockchip,rk3188-wdt", "snps,dw-wdt" "rockchip,rk3288-wdt", "snps,dw-wdt" "rockchip,rk3368-wdt", "snps,dw-wdt" make ARCH=arm dtbs_check DT_SCHEMA_FILES=Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml make ARCH=arm64 dtbs_check DT_SCHEMA_FILES=Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml Signed-off-by: Johan Jonker Reviewed-by: Rob Herring Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201116142539.12377-1-jbx6244@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/snps,dw-wdt.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml b/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml index d9fc7bb851b1..f7ee9229c29f 100644 --- a/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml +++ b/Documentation/devicetree/bindings/watchdog/snps,dw-wdt.yaml @@ -14,7 +14,15 @@ maintainers: properties: compatible: - const: snps,dw-wdt + oneOf: + - const: snps,dw-wdt + - items: + - enum: + - rockchip,rk3066-wdt + - rockchip,rk3188-wdt + - rockchip,rk3288-wdt + - rockchip,rk3368-wdt + - const: snps,dw-wdt reg: maxItems: 1 From c21172b3a73e8daf016eec52af229bb7b9c76cc8 Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Tue, 17 Nov 2020 16:22:13 +0100 Subject: [PATCH 113/326] watchdog: iTCO_wdt: use dev_*() instead of pr_*() for logging For device log outputs, it's better to have device name / ID prefixed in all messages, so use the proper dev_*() functions here. Explicit message on module load/unload don't seem to be really helpful (we have other means to check which modules have been loaded), instead just add noise to the kernel log. So, removing them. Signed-off-by: Enrico Weigelt, metux IT consult Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201117152214.32244-2-info@metux.net Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/iTCO_wdt.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index f2ddc8fc71cd..bf31d7b67a69 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -40,8 +40,6 @@ * Includes, defines, variables, module parameters, ... */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - /* Module and version information */ #define DRV_NAME "iTCO_wdt" #define DRV_VERSION "1.11" @@ -279,7 +277,7 @@ static int iTCO_wdt_start(struct watchdog_device *wd_dev) /* disable chipset's NO_REBOOT bit */ if (p->update_no_reboot_bit(p->no_reboot_priv, false)) { spin_unlock(&p->io_lock); - pr_err("failed to reset NO_REBOOT flag, reboot disabled by hardware/BIOS\n"); + dev_err(wd_dev->parent, "failed to reset NO_REBOOT flag, reboot disabled by hardware/BIOS\n"); return -EIO; } @@ -510,7 +508,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) /* Check chipset's NO_REBOOT bit */ if (p->update_no_reboot_bit(p->no_reboot_priv, false) && iTCO_vendor_check_noreboot_on()) { - pr_info("unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n"); + dev_info(dev, "unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n"); return -ENODEV; /* Cannot reset NO_REBOOT bit */ } @@ -530,12 +528,12 @@ static int iTCO_wdt_probe(struct platform_device *pdev) if (!devm_request_region(dev, p->tco_res->start, resource_size(p->tco_res), pdev->name)) { - pr_err("I/O address 0x%04llx already in use, device disabled\n", + dev_err(dev, "I/O address 0x%04llx already in use, device disabled\n", (u64)TCOBASE(p)); return -EBUSY; } - pr_info("Found a %s TCO device (Version=%d, TCOBASE=0x%04llx)\n", + dev_info(dev, "Found a %s TCO device (Version=%d, TCOBASE=0x%04llx)\n", pdata->name, pdata->version, (u64)TCOBASE(p)); /* Clear out the (probably old) status */ @@ -558,7 +556,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) break; } - p->wddev.info = &ident, + p->wddev.info = &ident, p->wddev.ops = &iTCO_wdt_ops, p->wddev.bootstatus = 0; p->wddev.timeout = WATCHDOG_TIMEOUT; @@ -575,7 +573,7 @@ static int iTCO_wdt_probe(struct platform_device *pdev) if not reset to the default */ if (iTCO_wdt_set_timeout(&p->wddev, heartbeat)) { iTCO_wdt_set_timeout(&p->wddev, WATCHDOG_TIMEOUT); - pr_info("timeout value out of range, using %d\n", + dev_info(dev, "timeout value out of range, using %d\n", WATCHDOG_TIMEOUT); } @@ -583,11 +581,11 @@ static int iTCO_wdt_probe(struct platform_device *pdev) watchdog_stop_on_unregister(&p->wddev); ret = devm_watchdog_register_device(dev, &p->wddev); if (ret != 0) { - pr_err("cannot register watchdog device (err=%d)\n", ret); + dev_err(dev, "cannot register watchdog device (err=%d)\n", ret); return ret; } - pr_info("initialized. heartbeat=%d sec (nowayout=%d)\n", + dev_info(dev, "initialized. heartbeat=%d sec (nowayout=%d)\n", heartbeat, nowayout); return 0; From 150927c3674d7db4dd51a7269e01423c8c78e53b Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Thu, 17 Dec 2020 16:52:44 +0800 Subject: [PATCH 114/326] ALSA: hda/realtek - Supported Dell fixed type headset This platform only supported iphone type headset. It can't support Dell headset mode. Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/b97e971978034bc9b772a08ec91265e8@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 71da03f950cc..c203eba8d9ff 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7881,6 +7881,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x09bf, "Dell Precision", ALC233_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0a2e, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0a58, "Dell Precision 3650 Tower", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 74c64efa1557fef731b59eb813f115436d18078e Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Fri, 18 Dec 2020 00:15:47 +0800 Subject: [PATCH 115/326] ALSA: core: memalloc: add page alignment for iram Since mmap for userspace is based on page alignment, add page alignment for iram alloc from pool, otherwise, some good data located in the same page of dmab->area maybe touched wrongly by userspace like pulseaudio. Signed-off-by: Robin Gong Cc: Link: https://lore.kernel.org/r/1608221747-3474-1-git-send-email-yibin.gong@nxp.com Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 0aeeb6244ff6..0f335162f87c 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -77,7 +77,8 @@ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size) /* Assign the pool into private_data field */ dmab->private_data = pool; - dmab->area = gen_pool_dma_alloc(pool, size, &dmab->addr); + dmab->area = gen_pool_dma_alloc_align(pool, size, &dmab->addr, + PAGE_SIZE); } /** From 725124d10d00b2f56bb5bd08b431cc74ab3b3ace Mon Sep 17 00:00:00 2001 From: Amadej Kastelic Date: Tue, 15 Dec 2020 19:09:05 +0100 Subject: [PATCH 116/326] ALSA: usb-audio: Add VID to support native DSD reproduction on FiiO devices Add VID to support native DSD reproduction on FiiO devices. Tested-by: Amadej Kastelic Signed-off-by: Emilio Moretti Signed-off-by: Amadej Kastelic Cc: Link: https://lore.kernel.org/r/X9j7wdXSr4XyK7Bd@ryzen.localdomain Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 63cdf3c8c2bc..e4a690bb4c99 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1771,6 +1771,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x25ce: /* Mytek devices */ case 0x278b: /* Rotel? */ case 0x292b: /* Gustard/Ess based devices */ + case 0x2972: /* FiiO devices */ case 0x2ab6: /* T+A devices */ case 0x3353: /* Khadas devices */ case 0x3842: /* EVGA */ From 4bc4a912534a72f1c96f483448f0be16e5a48063 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Dec 2020 07:53:33 -0700 Subject: [PATCH 117/326] io_uring: hold mmap_sem for mm->locked_vm manipulation The kernel doesn't seem to have clear rules around this, but various spots are using the mmap_sem to serialize access to modifying the locked_vm count. Play it safe and lock the mm for write when accounting or unaccounting locked memory. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a4560c9ed9a..2d07d35e7262 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8157,10 +8157,13 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) { - if (acct == ACCT_LOCKED) + if (acct == ACCT_LOCKED) { + mmap_write_lock(ctx->mm_account); ctx->mm_account->locked_vm -= nr_pages; - else if (acct == ACCT_PINNED) + mmap_write_unlock(ctx->mm_account); + }else if (acct == ACCT_PINNED) { atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); + } } } @@ -8176,10 +8179,13 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages, } if (ctx->mm_account) { - if (acct == ACCT_LOCKED) + if (acct == ACCT_LOCKED) { + mmap_write_lock(ctx->mm_account); ctx->mm_account->locked_vm += nr_pages; - else if (acct == ACCT_PINNED) + mmap_write_unlock(ctx->mm_account); + } else if (acct == ACCT_PINNED) { atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + } } return 0; From 1aba169e770911fb2afa63eb859883c4de2191e3 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 17 Dec 2020 00:58:47 -0800 Subject: [PATCH 118/326] nbd: Respect max_part for all partition scans The creation path of the NBD device respects max_part and only scans for partitions if max_part is not 0. However, some other code paths ignore max_part, and unconditionally scan for partitions. Add a check for max_part on each partition scan. Signed-off-by: Josh Triplett Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 92f84ed0ba9e..6727358e147d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -318,7 +318,8 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, blk_queue_logical_block_size(nbd->disk->queue, blksize); blk_queue_physical_block_size(nbd->disk->queue, blksize); - set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); + if (max_part) + set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); if (!set_capacity_and_notify(nbd->disk, bytesize >> 9)) kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); return 0; @@ -1476,9 +1477,11 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) refcount_set(&nbd->config_refs, 1); refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + if (max_part) + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); } else if (nbd_disconnected(nbd->config)) { - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + if (max_part) + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); } out: mutex_unlock(&nbd_index_mutex); From 76efc1c770968d6c786e5340029f8005ed29b2a5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 10 Dec 2020 18:56:44 +0800 Subject: [PATCH 119/326] blk-iocost: Add iocg idle state tracepoint It will be helpful to trace the iocg's whole state, including active and idle state. And we can easily expand the original iocost_iocg_activate trace event to support a state trace class, including active and idle state tracing. Acked-by: Tejun Heo Signed-off-by: Baolin Wang Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 +++ include/trace/events/iocost.h | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ffa418c0dcb1..ac6078a34939 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2185,6 +2185,9 @@ static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) WEIGHT_ONE); } + TRACE_IOCG_PATH(iocg_idle, iocg, now, + atomic64_read(&iocg->active_period), + atomic64_read(&ioc->cur_period), vtime); __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index 0b6869980ba2..e282ce02fa2d 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -11,7 +11,7 @@ struct ioc_gq; #include -TRACE_EVENT(iocost_iocg_activate, +DECLARE_EVENT_CLASS(iocost_iocg_state, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u64 last_period, u64 cur_period, u64 vtime), @@ -59,6 +59,20 @@ TRACE_EVENT(iocost_iocg_activate, ) ); +DEFINE_EVENT(iocost_iocg_state, iocost_iocg_activate, + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u64 last_period, u64 cur_period, u64 vtime), + + TP_ARGS(iocg, path, now, last_period, cur_period, vtime) +); + +DEFINE_EVENT(iocost_iocg_state, iocost_iocg_idle, + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u64 last_period, u64 cur_period, u64 vtime), + + TP_ARGS(iocg, path, now, last_period, cur_period, vtime) +); + DECLARE_EVENT_CLASS(iocg_inuse_update, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, From cda286f0715c82f8117e166afd42cca068876dde Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:35 +0000 Subject: [PATCH 120/326] io_uring: cancel reqs shouldn't kill overflow list io_uring_cancel_task_requests() doesn't imply that the ring is going away, it may continue to work well after that. The problem is that it sets ->cq_overflow_flushed effectively disabling the CQ overflow feature Split setting cq_overflow_flushed from flush, and do the first one only on exit. It's ok in terms of cancellations because there is a io_uring->in_idle check in __io_cqring_fill_event(). It also fixes a race with setting ->cq_overflow_flushed in io_uring_cancel_task_requests, whuch's is not atomic and a part of a bitmask with other flags. Though, the only other flag that's not set during init is drain_next, so it's not as bad for sane architectures. Signed-off-by: Pavel Begunkov Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d07d35e7262..0d0217281ccf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1733,10 +1733,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, spin_lock_irqsave(&ctx->completion_lock, flags); - /* if force is set, the ring is going away. always drop after that */ - if (force) - ctx->cq_overflow_flushed = 1; - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) @@ -8655,6 +8651,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); + /* if force is set, the ring is going away. always drop after that */ + ctx->cq_overflow_flushed = 1; if (ctx->rings) io_cqring_overflow_flush(ctx, true, NULL, NULL); mutex_unlock(&ctx->uring_lock); From 9cd2be519d05ee78876d55e8e902b7125f78b74f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:36 +0000 Subject: [PATCH 121/326] io_uring: remove racy overflow list fast checks list_empty_careful() is not racy only if some conditions are met, i.e. no re-adds after del_init. io_cqring_overflow_flush() does list_move(), so it's actually racy. Remove those checks, we have ->cq_check_overflow for the fast path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0d0217281ccf..fa3cf564ec06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1724,8 +1724,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, LIST_HEAD(list); if (!force) { - if (list_empty_careful(&ctx->cq_overflow_list)) - return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) return false; @@ -6822,8 +6820,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false, NULL, NULL)) + if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } From e23de15fdbd3070446b2d212373c0ae556f63d93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:37 +0000 Subject: [PATCH 122/326] io_uring: consolidate CQ nr events calculation Add a helper which calculates number of events in CQ. Handcoded version of it in io_cqring_overflow_flush() is not the clearest thing, so it makes it slightly more readable. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fa3cf564ec06..56ce97fc9c02 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1693,6 +1693,11 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return io_wq_current_is_worker(); } +static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) @@ -1723,14 +1728,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, unsigned long flags; LIST_HEAD(list); - if (!force) { - if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == - rings->cq_ring_entries)) - return false; - } + if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) + return false; spin_lock_irqsave(&ctx->completion_lock, flags); - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) @@ -2314,8 +2315,6 @@ static void io_double_put_req(struct io_kiocb *req) static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { - struct io_rings *rings = ctx->rings; - if (test_bit(0, &ctx->cq_check_overflow)) { /* * noflush == true is from the waitqueue handler, just ensure @@ -2330,7 +2329,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) /* See comment at the top of this file */ smp_rmb(); - return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); + return __io_cqring_events(ctx); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) From 09e88404f46cc32237f596c66f48a826294e08f2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:38 +0000 Subject: [PATCH 123/326] io_uring: inline io_cqring_mark_overflow() There is only one user of it and the name is misleading, get rid of it by inlining. By the way make overflow_flush's return value deduction simpler. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 56ce97fc9c02..5c9aa8b5e077 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1708,15 +1708,6 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) -{ - if (list_empty(&ctx->cq_overflow_list)) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; - } -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -1726,13 +1717,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct io_kiocb *req, *tmp; struct io_uring_cqe *cqe; unsigned long flags; + bool all_flushed; LIST_HEAD(list); if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) return false; spin_lock_irqsave(&ctx->completion_lock, flags); - cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { if (!io_match_task(req, tsk, files)) continue; @@ -1753,9 +1744,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, } } - io_commit_cqring(ctx); - io_cqring_mark_overflow(ctx); + all_flushed = list_empty(&ctx->cq_overflow_list); + if (all_flushed) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + } + io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1765,7 +1761,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, io_put_req(req); } - return cqe != NULL; + return all_flushed; } static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) From 89448c47b8452b67c146dc6cad6f737e004c5caf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 17 Dec 2020 00:24:39 +0000 Subject: [PATCH 124/326] io_uring: limit {io|sq}poll submit locking scope We don't need to take uring_lock for SQPOLL|IOPOLL to do io_cqring_overflow_flush() when cq_overflow_list is empty, remove it from the hot path. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5c9aa8b5e077..8cf6f22afc5e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9154,10 +9154,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); - if (!list_empty_careful(&ctx->cq_overflow_list)) + if (!list_empty_careful(&ctx->cq_overflow_list)) { + bool needs_lock = ctx->flags & IORING_SETUP_IOPOLL; + + io_ring_submit_lock(ctx, needs_lock); io_cqring_overflow_flush(ctx, false, NULL, NULL); - io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); + io_ring_submit_unlock(ctx, needs_lock); + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) From cca415537244f6102cbb09b5b90db6ae2c953bdd Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Sat, 7 Nov 2020 23:58:18 +0800 Subject: [PATCH 125/326] ext4: fix a memory leak of ext4_free_data When freeing metadata, we will create an ext4_free_data and insert it into the pending free list. After the current transaction is committed, the object will be freed. ext4_mb_free_metadata() will check whether the area to be freed overlaps with the pending free list. If true, return directly. At this time, ext4_free_data is leaked. Fortunately, the probability of this problem is small, since it only occurs if the file system is corrupted such that a block is claimed by more one inode and those inodes are deleted within a single jbd2 transaction. Signed-off-by: Chunguang Xu Link: https://lore.kernel.org/r/1604764698-4269-8-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 29dfeb043050..77815cd110b2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5103,6 +5103,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); + kmem_cache_free(ext4_free_data_cachep, new_entry); return 0; } } From c9200760da8a728eb9767ca41a956764b28c1310 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 9 Dec 2020 15:59:11 -0500 Subject: [PATCH 126/326] ext4: check for invalid block size early when mounting a file system Check for valid block size directly by validating s_log_block_size; we were doing this in two places. First, by calculating blocksize via BLOCK_SIZE << s_log_block_size, and then checking that the blocksize was valid. And then secondly, by checking s_log_block_size directly. The first check is not reliable, and can trigger an UBSAN warning if s_log_block_size on a maliciously corrupted superblock is greater than 22. This is harmless, since the second test will correctly reject the maliciously fuzzed file system, but to make syzbot shut up, and because the two checks are duplicative in any case, delete the blocksize check, and move the s_log_block_size earlier in ext4_fill_super(). Signed-off-by: Theodore Ts'o Reported-by: syzbot+345b75652b1d24227443@syzkaller.appspotmail.com --- fs/ext4/super.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f86220a8df50..4a16bbf0432c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4202,19 +4202,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize == PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { - ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d (%d log_block_size)", - blocksize, le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; @@ -4432,21 +4439,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; - if (le32_to_cpu(es->s_log_block_size) > - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log block size: %u", - le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; - } - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", From bc18546bf68e47996a359d2533168d5770a22024 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 23 Oct 2020 14:22:32 +0300 Subject: [PATCH 127/326] ext4: fix an IS_ERR() vs NULL check The ext4_find_extent() function never returns NULL, it returns error pointers. Fixes: 44059e503b03 ("ext4: fast commit recovery path") Signed-off-by: Dan Carpenter Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201023112232.GB282278@mwanda Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 90ba74c14694..3960b7ec3ab7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5815,8 +5815,8 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int ret; path = ext4_find_extent(inode, start, NULL, 0); - if (!path) - return -EINVAL; + if (IS_ERR(path)) + return PTR_ERR(path); ex = path[path->p_depth].p_ext; if (!ex) { ret = -EFSCORRUPTED; From 03505c58b86a5ca9bff2a9d611c2fe95dc14f707 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 12 Nov 2020 14:56:42 +0800 Subject: [PATCH 128/326] ext4: remove the unused EXT4_CURRENT_REV macro There are no callers of the EXT4_CURRENT_REV macro, so remove it. Signed-off-by: Kaixu Xia Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1605164202-31120-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cd95965be3d1..264d3a092c7d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1868,7 +1868,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ -#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 From b1b7dce3f09b460da38946d1845f3076daa36abb Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Thu, 19 Nov 2020 15:28:22 -0800 Subject: [PATCH 129/326] ext4: add docs about fast commit idempotence Fast commit on-disk format is designed such that the replay of these tags can be idempotent. This patch adds documentation in the code in form of comments and in form kernel docs that describes these characteristics. This patch also adds a TODO item needed to ensure kernel fast commit replay idempotence. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201119232822.1860882-1-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/journal.rst | 50 ++++++++++++++++++ fs/ext4/fast_commit.c | 61 ++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index 849d5b119eb8..cdbfec473167 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -681,3 +681,53 @@ Here is the list of supported tags and their meanings: - Stores the TID of the commit, CRC of the fast commit of which this tag represents the end of +Fast Commit Replay Idempotence +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fast commits tags are idempotent in nature provided the recovery code follows +certain rules. The guiding principle that the commit path follows while +committing is that it stores the result of a particular operation instead of +storing the procedure. + +Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' +was associated with inode 10. During fast commit, instead of storing this +operation as a procedure "rename a to b", we store the resulting file system +state as a "series" of outcomes: + +- Link dirent b to inode 10 +- Unlink dirent a +- Inode 10 with valid refcount + +Now when recovery code runs, it needs "enforce" this state on the file +system. This is what guarantees idempotence of fast commit replay. + +Let's take an example of a procedure that is not idempotent and see how fast +commits make it idempotent. Consider following sequence of operations: + +1) rm A +2) mv B A +3) read A + +If we store this sequence of operations as is then the replay is not idempotent. +Let's say while in replay, we crash after (2). During the second replay, +file A (which was actually created as a result of "mv B A" operation) would get +deleted. Thus, file named A would be absent when we try to read A. So, this +sequence of operations is not idempotent. However, as mentioned above, instead +of storing the procedure fast commits store the outcome of each procedure. Thus +the fast commit log for above procedure would be as follows: + +(Let's assume dirent A was linked to inode 10 and dirent B was linked to +inode 11 before the replay) + +1) Unlink A +2) Link A to inode 11 +3) Unlink B +4) Inode 11 + +If we crash after (3) we will have file A linked to inode 11. During the second +replay, we will remove file A (inode 11). But we will create it back and make +it point to inode 11. We won't find B, so we'll just skip that step. At this +point, the refcount for inode 11 is not reliable, but that gets fixed by the +replay of last inode 11 tag. Thus, by converting a non-idempotent procedure +into a series of idempotent outcomes, fast commits ensured idempotence during +the replay. diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f2033e13a273..b4bc8bf307c9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -103,8 +103,69 @@ * * Replay code should thus check for all the valid tails in the FC area. * + * Fast Commit Replay Idempotence + * ------------------------------ + * + * Fast commits tags are idempotent in nature provided the recovery code follows + * certain rules. The guiding principle that the commit path follows while + * committing is that it stores the result of a particular operation instead of + * storing the procedure. + * + * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' + * was associated with inode 10. During fast commit, instead of storing this + * operation as a procedure "rename a to b", we store the resulting file system + * state as a "series" of outcomes: + * + * - Link dirent b to inode 10 + * - Unlink dirent a + * - Inode <10> with valid refcount + * + * Now when recovery code runs, it needs "enforce" this state on the file + * system. This is what guarantees idempotence of fast commit replay. + * + * Let's take an example of a procedure that is not idempotent and see how fast + * commits make it idempotent. Consider following sequence of operations: + * + * rm A; mv B A; read A + * (x) (y) (z) + * + * (x), (y) and (z) are the points at which we can crash. If we store this + * sequence of operations as is then the replay is not idempotent. Let's say + * while in replay, we crash at (z). During the second replay, file A (which was + * actually created as a result of "mv B A" operation) would get deleted. Thus, + * file named A would be absent when we try to read A. So, this sequence of + * operations is not idempotent. However, as mentioned above, instead of storing + * the procedure fast commits store the outcome of each procedure. Thus the fast + * commit log for above procedure would be as follows: + * + * (Let's assume dirent A was linked to inode 10 and dirent B was linked to + * inode 11 before the replay) + * + * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] + * (w) (x) (y) (z) + * + * If we crash at (z), we will have file A linked to inode 11. During the second + * replay, we will remove file A (inode 11). But we will create it back and make + * it point to inode 11. We won't find B, so we'll just skip that step. At this + * point, the refcount for inode 11 is not reliable, but that gets fixed by the + * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled + * similarly. Thus, by converting a non-idempotent procedure into a series of + * idempotent outcomes, fast commits ensured idempotence during the replay. + * * TODOs * ----- + * + * 0) Fast commit replay path hardening: Fast commit replay code should use + * journal handles to make sure all the updates it does during the replay + * path are atomic. With that if we crash during fast commit replay, after + * trying to do recovery again, we will find a file system where fast commit + * area is invalid (because new full commit would be found). In order to deal + * with that, fast commit replay code should ensure that the "FC_REPLAY" + * superblock state is persisted before starting the replay, so that after + * the crash, fast commit recovery code can look at that flag and perform + * fast commit recovery even if that area is invalidated by later full + * commits. + * * 1) Make fast commit atomic updates more fine grained. Today, a fast commit * eligible update must be protected within ext4_fc_start_update() and * ext4_fc_stop_update(). These routines are called at much higher From 5a150bdec7dc79ad88e61cdf8c13106dd878311e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:28:32 -0600 Subject: [PATCH 130/326] ext4: fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/03497331f088a938d7a728e7a689bd7953139429.1605896059.git.gustavoars@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4a16bbf0432c..872d45a131ca 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4878,6 +4878,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "requested data journaling mode"); goto failed_mount_wq; } + break; default: break; } From 941ba122ca56756aad82db21d28f283ad33b8dee Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Fri, 20 Nov 2020 12:22:31 -0800 Subject: [PATCH 131/326] ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h This patch makes fast_commit.h byte by byte identical with e2fsprogs/fast_commit.h. This will help us ensure that there are no on-disk format inconsistencies between e2fsck and kernel ext4. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201120202232.2240293-1-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 38 --------------------- fs/ext4/fast_commit.h | 78 +++++++++++++++++++++++++++++++++---------- 2 files changed, 61 insertions(+), 55 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b4bc8bf307c9..4fcc21c25e79 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1281,18 +1281,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full) /* Ext4 Replay Path Routines */ -/* Get length of a particular tlv */ -static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) -{ - return le16_to_cpu(tl->fc_len); -} - -/* Get a pointer to "value" of a tlv */ -static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) -{ - return (u8 *)tl + sizeof(*tl); -} - /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; @@ -1831,32 +1819,6 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) return 0; } -static inline const char *tag2str(u16 tag) -{ - switch (tag) { - case EXT4_FC_TAG_LINK: - return "TAG_ADD_ENTRY"; - case EXT4_FC_TAG_UNLINK: - return "TAG_DEL_ENTRY"; - case EXT4_FC_TAG_ADD_RANGE: - return "TAG_ADD_RANGE"; - case EXT4_FC_TAG_CREAT: - return "TAG_CREAT_DENTRY"; - case EXT4_FC_TAG_DEL_RANGE: - return "TAG_DEL_RANGE"; - case EXT4_FC_TAG_INODE: - return "TAG_INODE"; - case EXT4_FC_TAG_PAD: - return "TAG_PAD"; - case EXT4_FC_TAG_TAIL: - return "TAG_TAIL"; - case EXT4_FC_TAG_HEAD: - return "TAG_HEAD"; - default: - return "TAG_ERROR"; - } -} - static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h index 3a6e5a1fa1b8..b77f70f55a62 100644 --- a/fs/ext4/fast_commit.h +++ b/fs/ext4/fast_commit.h @@ -3,6 +3,11 @@ #ifndef __FAST_COMMIT_H__ #define __FAST_COMMIT_H__ +/* + * Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and + * linux/fs/ext4/fast_commit.h. These file should always be byte identical. + */ + /* Fast commit tags */ #define EXT4_FC_TAG_ADD_RANGE 0x0001 #define EXT4_FC_TAG_DEL_RANGE 0x0002 @@ -50,7 +55,7 @@ struct ext4_fc_del_range { struct ext4_fc_dentry_info { __le32 fc_parent_ino; __le32 fc_ino; - u8 fc_dname[0]; + __u8 fc_dname[0]; }; /* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */ @@ -65,19 +70,6 @@ struct ext4_fc_tail { __le32 fc_crc; }; -/* - * In memory list of dentry updates that are performed on the file - * system used by fast commit code. - */ -struct ext4_fc_dentry_update { - int fcd_op; /* Type of update create / unlink / link */ - int fcd_parent; /* Parent inode number */ - int fcd_ino; /* Inode number */ - struct qstr fcd_name; /* Dirent name */ - unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ - struct list_head fcd_list; -}; - /* * Fast commit reason codes */ @@ -107,6 +99,20 @@ enum { EXT4_FC_REASON_MAX }; +#ifdef __KERNEL__ +/* + * In memory list of dentry updates that are performed on the file + * system used by fast commit code. + */ +struct ext4_fc_dentry_update { + int fcd_op; /* Type of update create / unlink / link */ + int fcd_parent; /* Parent inode number */ + int fcd_ino; /* Inode number */ + struct qstr fcd_name; /* Dirent name */ + unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */ + struct list_head fcd_list; +}; + struct ext4_fc_stats { unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX]; unsigned long fc_num_commits; @@ -145,13 +151,51 @@ struct ext4_fc_replay_state { }; #define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1) +#endif #define fc_for_each_tl(__start, __end, __tl) \ - for (tl = (struct ext4_fc_tl *)start; \ - (u8 *)tl < (u8 *)end; \ - tl = (struct ext4_fc_tl *)((u8 *)tl + \ + for (tl = (struct ext4_fc_tl *)(__start); \ + (__u8 *)tl < (__u8 *)(__end); \ + tl = (struct ext4_fc_tl *)((__u8 *)tl + \ sizeof(struct ext4_fc_tl) + \ + le16_to_cpu(tl->fc_len))) +static inline const char *tag2str(__u16 tag) +{ + switch (tag) { + case EXT4_FC_TAG_LINK: + return "ADD_ENTRY"; + case EXT4_FC_TAG_UNLINK: + return "DEL_ENTRY"; + case EXT4_FC_TAG_ADD_RANGE: + return "ADD_RANGE"; + case EXT4_FC_TAG_CREAT: + return "CREAT_DENTRY"; + case EXT4_FC_TAG_DEL_RANGE: + return "DEL_RANGE"; + case EXT4_FC_TAG_INODE: + return "INODE"; + case EXT4_FC_TAG_PAD: + return "PAD"; + case EXT4_FC_TAG_TAIL: + return "TAIL"; + case EXT4_FC_TAG_HEAD: + return "HEAD"; + default: + return "ERROR"; + } +} + +/* Get length of a particular tlv */ +static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl) +{ + return le16_to_cpu(tl->fc_len); +} + +/* Get a pointer to "value" of a tlv */ +static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl) +{ + return (__u8 *)tl + sizeof(*tl); +} #endif /* __FAST_COMMIT_H__ */ From 9bd23c31f392bda88618008f27fd52ee9e0fac38 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Fri, 20 Nov 2020 12:22:32 -0800 Subject: [PATCH 132/326] jbd2: add a helper to find out number of fast commit blocks Add a helper to read number of fast commit blocks from jbd2 superblock and also rename the JBD2_MIN_FC_BLKS to JBD2_DEFAULT_FAST_COMMIT_BLOCKS since this constant is just the default number of fast commit blocks to use in case number of fast commit blocks isn't set in jbd2 superblock. Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201120202232.2240293-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 ++------ include/linux/jbd2.h | 9 ++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 188f79d76988..2dc944442802 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal) if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); - if (!num_fc_blocks) - num_fc_blocks = JBD2_MIN_FC_BLOCKS; + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; @@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; unsigned long long num_fc_blks; - num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); - if (num_fc_blks == 0) - num_fc_blks = JBD2_MIN_FC_BLOCKS; + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) return -ENOSPC; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d2a4860feb72..99d3cd051ac3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -#define JBD2_MIN_FC_BLOCKS 256 +#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ @@ -1692,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) return journal->j_chksum_driver != NULL; } +static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) +{ + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); + + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; +} + /* * Return number of free blocks in the log. Must be called under j_state_lock. */ From 46e294efc355c48d1dd4d58501aa56dac461792a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:06:49 +0100 Subject: [PATCH 133/326] ext4: fix deadlock with fs freezing and EA inodes Xattr code using inodes with large xattr data can end up dropping last inode reference (and thus deleting the inode) from places like ext4_xattr_set_entry(). That function is called with transaction started and so ext4_evict_inode() can deadlock against fs freezing like: CPU1 CPU2 removexattr() freeze_super() vfs_removexattr() ext4_xattr_set() handle = ext4_journal_start() ... ext4_xattr_set_entry() iput(old_ea_inode) ext4_evict_inode(old_ea_inode) sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); ext4_freeze() jbd2_journal_lock_updates() -> blocks waiting for all handles to stop sb_start_intwrite() -> blocks as sb is already in SB_FREEZE_FS state Generally it is advisable to delete inodes from a separate transaction as it can consume quite some credits however in this case it would be quite clumsy and furthermore the credits for inode deletion are quite limited and already accounted for. So just tweak ext4_evict_inode() to avoid freeze protection if we have transaction already started and thus it is not really needed anyway. Cc: stable@vger.kernel.org Fixes: dec214d00e0d ("ext4: xattr inode deduplication") Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127110649.24730-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b147c2e20469..6b44657fb3e4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -175,6 +175,7 @@ void ext4_evict_inode(struct inode *inode) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; + bool freeze_protected = false; trace_ext4_evict_inode(inode); @@ -232,9 +233,14 @@ void ext4_evict_inode(struct inode *inode) /* * Protect us against freezing - iput() caller didn't have to have any - * protection against it + * protection against it. When we are in a running transaction though, + * we are already protected against freezing and we cannot grab further + * protection due to lock ordering constraints. */ - sb_start_intwrite(inode->i_sb); + if (!ext4_journal_current_handle()) { + sb_start_intwrite(inode->i_sb); + freeze_protected = true; + } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); @@ -253,7 +259,8 @@ void ext4_evict_inode(struct inode *inode) * cleaned up. */ ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); goto no_delete; } @@ -294,7 +301,8 @@ void ext4_evict_inode(struct inode *inode) stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); - sb_end_intwrite(inode->i_sb); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } @@ -323,7 +331,8 @@ stop_handle: else ext4_free_inode(handle, inode); ext4_journal_stop(handle); - sb_end_intwrite(inode->i_sb); + if (freeze_protected) + sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: From b08070eca9e247f60ab39d79b2c25d274750441f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:54 +0100 Subject: [PATCH 134/326] ext4: don't remount read-only with errors=continue on reboot ext4_handle_error() with errors=continue mount option can accidentally remount the filesystem read-only when the system is rebooting. Fix that. Fixes: 1dc1097ff60e ("ext4: avoid panic during forced reboot") Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Cc: stable@kernel.org Link: https://lore.kernel.org/r/20201127113405.26867-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 872d45a131ca..3ef84e8ab1ae 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -666,19 +666,17 @@ static bool system_going_down(void) static void ext4_handle_error(struct super_block *sb) { + journal_t *journal = EXT4_SB(sb)->s_journal; + if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (sb_rdonly(sb)) + if (sb_rdonly(sb) || test_opt(sb, ERRORS_CONT)) return; - if (!test_opt(sb, ERRORS_CONT)) { - journal_t *journal = EXT4_SB(sb)->s_journal; - - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + if (journal) + jbd2_journal_abort(journal, -EIO); /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already From 81414b4dd48f596bf33e1b32c2e43e2047150ca6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:55 +0100 Subject: [PATCH 135/326] ext4: remove redundant sb checksum recomputation Superblock is written out either through ext4_commit_super() or through ext4_handle_dirty_super(). In both cases we recompute the checksum so it is not necessary to recompute it after updating superblock free inodes & blocks counters. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127113405.26867-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ef84e8ab1ae..7bb516c9b2de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5010,13 +5010,11 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - ext4_superblock_csum_set(sb); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } From 93c20bc3eafba52c134cf5183f18833b9bd22bf8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:56 +0100 Subject: [PATCH 136/326] ext4: standardize error message in ext4_protect_reserved_inode() We use __ext4_error() when ext4_protect_reserved_inode() finds filesystem corruption. However EXT4_ERROR_INODE_ERR() is perfectly capable of reporting all the needed information. So just use that. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127113405.26867-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/block_validity.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index c13422a770e4..4666b55b736e 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -176,12 +176,10 @@ static int ext4_protect_reserved_inode(struct super_block *sb, err = add_system_zone(system_blks, map.m_pblk, n, ino); if (err < 0) { if (err == -EFSCORRUPTED) { - __ext4_error(sb, __func__, __LINE__, - -err, map.m_pblk, - "blocks %llu-%llu from inode %u overlap system zone", - map.m_pblk, - map.m_pblk + map.m_len - 1, - ino); + EXT4_ERROR_INODE_ERR(inode, -err, + "blocks %llu-%llu from inode overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1); } break; } From 014c9caa29d3a44e0de695c99ef18bec3e887d52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:57 +0100 Subject: [PATCH 137/326] ext4: make ext4_abort() use __ext4_error() The only difference between __ext4_abort() and __ext4_error() is that the former one ignores errors=continue mount option. Unify the code to reduce duplication. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127113405.26867-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 29 +++++++--------- fs/ext4/ext4_jbd2.c | 4 +-- fs/ext4/inode.c | 2 +- fs/ext4/super.c | 84 ++++++++++++--------------------------------- 4 files changed, 37 insertions(+), 82 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 264d3a092c7d..84c63b9b8310 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2961,9 +2961,9 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); -extern __printf(6, 7) -void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64, - const char *, ...); +extern __printf(7, 8) +void __ext4_error(struct super_block *, const char *, unsigned int, bool, + int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, int, const char *, ...); @@ -2972,9 +2972,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern __printf(5, 6) -void __ext4_abort(struct super_block *, const char *, unsigned int, int, - const char *, ...); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); @@ -3004,6 +3001,9 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) +#define ext4_abort(sb, err, fmt, a...) \ + __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) + #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ @@ -3014,11 +3014,11 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ - __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__) + __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ + ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ - __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__) -#define ext4_abort(sb, err, fmt, ...) \ - __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__) + __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ + ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ @@ -3051,17 +3051,12 @@ do { \ #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error(sb, "", 0, 0, 0, " "); \ + __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error(sb, "", 0, err, 0, " "); \ -} while (0) -#define ext4_abort(sb, err, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __ext4_abort(sb, "", 0, err, " "); \ + __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0fd0c42a4f7d..1a0a827a7f34 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -296,8 +296,8 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); - __ext4_abort(inode->i_sb, where, line, -err, - "error %d when attempting revoke", err); + __ext4_error(inode->i_sb, where, line, true, -err, 0, + "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6b44657fb3e4..ab786123a022 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4619,7 +4619,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); - __ext4_error(sb, function, line, EFSCORRUPTED, 0, + __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7bb516c9b2de..930396eb8e6e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -662,16 +662,21 @@ static bool system_going_down(void) * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. + * + * If force_ro is set, we unconditionally force the filesystem into an + * ABORT|READONLY state, unless the error response on the fs has been set to + * panic in which case we take the easy way out and panic immediately. This is + * used to deal with unrecoverable failures such as journal IO errors or ENOMEM + * at a critical moment in log management. */ - -static void ext4_handle_error(struct super_block *sb) +static void ext4_handle_error(struct super_block *sb, bool force_ro) { journal_t *journal = EXT4_SB(sb)->s_journal; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (sb_rdonly(sb) || test_opt(sb, ERRORS_CONT)) + if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT))) return; ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); @@ -682,18 +687,17 @@ static void ext4_handle_error(struct super_block *sb) * could panic during 'reboot -f' as the underlying device got already * disabled. */ - if (test_opt(sb, ERRORS_RO) || system_going_down()) { - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - /* - * Make sure updated value of ->s_mount_flags will be visible - * before ->s_flags update - */ - smp_wmb(); - sb->s_flags |= SB_RDONLY; - } else if (test_opt(sb, ERRORS_PANIC)) { + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible before + * ->s_flags update + */ + smp_wmb(); + sb->s_flags |= SB_RDONLY; } #define ext4_error_ratelimit(sb) \ @@ -701,7 +705,7 @@ static void ext4_handle_error(struct super_block *sb) "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, - unsigned int line, int error, __u64 block, + unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; @@ -721,7 +725,7 @@ void __ext4_error(struct super_block *sb, const char *function, va_end(args); } save_error_info(sb, error, 0, block, function, line); - ext4_handle_error(sb); + ext4_handle_error(sb, force_ro); } void __ext4_error_inode(struct inode *inode, const char *function, @@ -753,7 +757,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, } save_error_info(inode->i_sb, error, inode->i_ino, block, function, line); - ext4_handle_error(inode->i_sb); + ext4_handle_error(inode->i_sb, false); } void __ext4_error_file(struct file *file, const char *function, @@ -792,7 +796,7 @@ void __ext4_error_file(struct file *file, const char *function, } save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, function, line); - ext4_handle_error(inode->i_sb); + ext4_handle_error(inode->i_sb, false); } const char *ext4_decode_error(struct super_block *sb, int errno, @@ -860,51 +864,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, } save_error_info(sb, -errno, 0, 0, function, line); - ext4_handle_error(sb); -} - -/* - * ext4_abort is a much stronger failure handler than ext4_error. The - * abort function may be used to deal with unrecoverable failures such - * as journal IO errors or ENOMEM at a critical moment in log management. - * - * We unconditionally force the filesystem into an ABORT|READONLY state, - * unless the error response on the fs has been set to panic in which - * case we take the easy way out and panic immediately. - */ - -void __ext4_abort(struct super_block *sb, const char *function, - unsigned int line, int error, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) - return; - - save_error_info(sb, error, 0, 0, function, line); - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n", - sb->s_id, function, line, &vaf); - va_end(args); - - if (sb_rdonly(sb) == 0) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); - - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - /* - * Make sure updated value of ->s_mount_flags will be visible - * before ->s_flags update - */ - smp_wmb(); - sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) - panic("EXT4-fs panic from previous error\n"); + ext4_handle_error(sb, false); } void __ext4_msg(struct super_block *sb, @@ -1007,7 +967,7 @@ __acquires(bitlock) ext4_unlock_group(sb, grp); ext4_commit_super(sb, 1); - ext4_handle_error(sb); + ext4_handle_error(sb, false); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the From 4067662388f97d0f360e568820d9d5bac6a3c9fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:58 +0100 Subject: [PATCH 138/326] ext4: move functions in super.c Just move error info related functions in super.c close to ext4_handle_error(). We'll want to combine save_error_info() with ext4_handle_error() and this makes change more obvious and saves a forward declaration as well. No functional change. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127113405.26867-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 196 ++++++++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 930396eb8e6e..88d19f354dc3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -423,104 +423,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -static void __save_error_info(struct super_block *sb, int error, - __u32 ino, __u64 block, - const char *func, unsigned int line) -{ - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - int err; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - if (bdev_read_only(sb->s_bdev)) - return; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_update_tstamp(es, s_last_error_time); - strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); - es->s_last_error_line = cpu_to_le32(line); - es->s_last_error_ino = cpu_to_le32(ino); - es->s_last_error_block = cpu_to_le64(block); - switch (error) { - case EIO: - err = EXT4_ERR_EIO; - break; - case ENOMEM: - err = EXT4_ERR_ENOMEM; - break; - case EFSBADCRC: - err = EXT4_ERR_EFSBADCRC; - break; - case 0: - case EFSCORRUPTED: - err = EXT4_ERR_EFSCORRUPTED; - break; - case ENOSPC: - err = EXT4_ERR_ENOSPC; - break; - case ENOKEY: - err = EXT4_ERR_ENOKEY; - break; - case EROFS: - err = EXT4_ERR_EROFS; - break; - case EFBIG: - err = EXT4_ERR_EFBIG; - break; - case EEXIST: - err = EXT4_ERR_EEXIST; - break; - case ERANGE: - err = EXT4_ERR_ERANGE; - break; - case EOVERFLOW: - err = EXT4_ERR_EOVERFLOW; - break; - case EBUSY: - err = EXT4_ERR_EBUSY; - break; - case ENOTDIR: - err = EXT4_ERR_ENOTDIR; - break; - case ENOTEMPTY: - err = EXT4_ERR_ENOTEMPTY; - break; - case ESHUTDOWN: - err = EXT4_ERR_ESHUTDOWN; - break; - case EFAULT: - err = EXT4_ERR_EFAULT; - break; - default: - err = EXT4_ERR_UNKNOWN; - } - es->s_last_error_errcode = err; - if (!es->s_first_error_time) { - es->s_first_error_time = es->s_last_error_time; - es->s_first_error_time_hi = es->s_last_error_time_hi; - strncpy(es->s_first_error_func, func, - sizeof(es->s_first_error_func)); - es->s_first_error_line = cpu_to_le32(line); - es->s_first_error_ino = es->s_last_error_ino; - es->s_first_error_block = es->s_last_error_block; - es->s_first_error_errcode = es->s_last_error_errcode; - } - /* - * Start the daily error reporting function if it hasn't been - * started already - */ - if (!es->s_error_count) - mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - le32_add_cpu(&es->s_error_count, 1); -} - -static void save_error_info(struct super_block *sb, int error, - __u32 ino, __u64 block, - const char *func, unsigned int line) -{ - __save_error_info(sb, error, ino, block, func, line); - if (!bdev_read_only(sb->s_bdev)) - ext4_commit_super(sb, 1); -} - /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -649,6 +551,104 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } +static void __save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + int err; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_update_tstamp(es, s_last_error_time); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + switch (error) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case 0: + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + es->s_last_error_errcode = err; + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + es->s_first_error_time_hi = es->s_last_error_time_hi; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + le32_add_cpu(&es->s_error_count, 1); +} + +static void save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) +{ + __save_error_info(sb, error, ino, block, func, line); + if (!bdev_read_only(sb->s_bdev)) + ext4_commit_super(sb, 1); +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * From 02a7780e4d2fcf438ac6773bc469e7ada2af56be Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:33:59 +0100 Subject: [PATCH 139/326] ext4: simplify ext4 error translation We convert errno's to ext4 on-disk format error codes in save_error_info(). Add a function and a bit of macro magic to make this simpler. Signed-off-by: Jan Kara Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20201127113405.26867-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 95 +++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88d19f354dc3..0e6d9a4bd72f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -551,76 +551,61 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } +struct ext4_err_translation { + int code; + int errno; +}; + +#define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } + +static struct ext4_err_translation err_translation[] = { + EXT4_ERR_TRANSLATE(EIO), + EXT4_ERR_TRANSLATE(ENOMEM), + EXT4_ERR_TRANSLATE(EFSBADCRC), + EXT4_ERR_TRANSLATE(EFSCORRUPTED), + EXT4_ERR_TRANSLATE(ENOSPC), + EXT4_ERR_TRANSLATE(ENOKEY), + EXT4_ERR_TRANSLATE(EROFS), + EXT4_ERR_TRANSLATE(EFBIG), + EXT4_ERR_TRANSLATE(EEXIST), + EXT4_ERR_TRANSLATE(ERANGE), + EXT4_ERR_TRANSLATE(EOVERFLOW), + EXT4_ERR_TRANSLATE(EBUSY), + EXT4_ERR_TRANSLATE(ENOTDIR), + EXT4_ERR_TRANSLATE(ENOTEMPTY), + EXT4_ERR_TRANSLATE(ESHUTDOWN), + EXT4_ERR_TRANSLATE(EFAULT), +}; + +static int ext4_errno_to_code(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(err_translation); i++) + if (err_translation[i].errno == errno) + return err_translation[i].code; + return EXT4_ERR_UNKNOWN; +} + static void __save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - int err; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (bdev_read_only(sb->s_bdev)) return; + /* We default to EFSCORRUPTED error... */ + if (error == 0) + error = EFSCORRUPTED; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); es->s_last_error_ino = cpu_to_le32(ino); es->s_last_error_block = cpu_to_le64(block); - switch (error) { - case EIO: - err = EXT4_ERR_EIO; - break; - case ENOMEM: - err = EXT4_ERR_ENOMEM; - break; - case EFSBADCRC: - err = EXT4_ERR_EFSBADCRC; - break; - case 0: - case EFSCORRUPTED: - err = EXT4_ERR_EFSCORRUPTED; - break; - case ENOSPC: - err = EXT4_ERR_ENOSPC; - break; - case ENOKEY: - err = EXT4_ERR_ENOKEY; - break; - case EROFS: - err = EXT4_ERR_EROFS; - break; - case EFBIG: - err = EXT4_ERR_EFBIG; - break; - case EEXIST: - err = EXT4_ERR_EEXIST; - break; - case ERANGE: - err = EXT4_ERR_ERANGE; - break; - case EOVERFLOW: - err = EXT4_ERR_EOVERFLOW; - break; - case EBUSY: - err = EXT4_ERR_EBUSY; - break; - case ENOTDIR: - err = EXT4_ERR_ENOTDIR; - break; - case ENOTEMPTY: - err = EXT4_ERR_ENOTEMPTY; - break; - case ESHUTDOWN: - err = EXT4_ERR_ESHUTDOWN; - break; - case EFAULT: - err = EXT4_ERR_EFAULT; - break; - default: - err = EXT4_ERR_UNKNOWN; - } - es->s_last_error_errcode = err; + es->s_last_error_errcode = ext4_errno_to_code(error); if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; From 658a337a606f48b7ebe451591f7681d383fa115e Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 17 Dec 2020 16:59:04 +0100 Subject: [PATCH 140/326] s390/dasd: fix hanging device offline processing For an LCU update a read unit address configuration IO is required. This is started using sleep_on(), which has early exit paths in case the device is not usable for IO. For example when it is in offline processing. In those cases the LCU update should fail and not be retried. Therefore lcu_update_work checks if EOPNOTSUPP is returned or not. Commit 41995342b40c ("s390/dasd: fix endless loop after read unit address configuration") accidentally removed the EOPNOTSUPP return code from read_unit_address_configuration(), which in turn might lead to an endless loop of the LCU update in offline processing. Fix by returning EOPNOTSUPP again if the device is not able to perform the request. Fixes: 41995342b40c ("s390/dasd: fix endless loop after read unit address configuration") Cc: stable@vger.kernel.org #5.3 Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_alias.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index 99f86612f775..31e8b5d48e86 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -462,11 +462,19 @@ static int read_unit_address_configuration(struct dasd_device *device, spin_unlock_irqrestore(&lcu->lock, flags); rc = dasd_sleep_on(cqr); - if (rc && !suborder_not_supported(cqr)) { + if (!rc) + goto out; + + if (suborder_not_supported(cqr)) { + /* suborder not supported or device unusable for IO */ + rc = -EOPNOTSUPP; + } else { + /* IO failed but should be retried */ spin_lock_irqsave(&lcu->lock, flags); lcu->flags |= NEED_UAC_UPDATE; spin_unlock_irqrestore(&lcu->lock, flags); } +out: dasd_sfree_request(cqr, cqr->memdev); return rc; } From a29ea01653493b94ea12bb2b89d1564a265081b6 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 17 Dec 2020 16:59:05 +0100 Subject: [PATCH 141/326] s390/dasd: prevent inconsistent LCU device data Prevent _lcu_update from adding a device to a pavgroup if the LCU still requires an update. The data is not reliable any longer and in parallel devices might have been moved on the lists already. This might lead to list corruptions or invalid PAV grouping. Only add devices to a pavgroup if the LCU is up to date. Additional steps are taken by the scheduled lcu update. Fixes: 8e09f21574ea ("[S390] dasd: add hyper PAV support to DASD device driver, part 1") Cc: stable@vger.kernel.org Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_alias.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index 31e8b5d48e86..f841518de6c5 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -511,6 +511,14 @@ static int _lcu_update(struct dasd_device *refdev, struct alias_lcu *lcu) return rc; spin_lock_irqsave(&lcu->lock, flags); + /* + * there is another update needed skip the remaining handling + * the data might already be outdated + * but especially do not add the device to an LCU with pending + * update + */ + if (lcu->flags & NEED_UAC_UPDATE) + goto out; lcu->pav = NO_PAV; for (i = 0; i < MAX_DEVICES_PER_LCU; ++i) { switch (lcu->uac->unit[i].ua_type) { @@ -529,6 +537,7 @@ static int _lcu_update(struct dasd_device *refdev, struct alias_lcu *lcu) alias_list) { _add_device_to_lcu(lcu, device, refdev); } +out: spin_unlock_irqrestore(&lcu->lock, flags); return 0; } From 0ede91f83aa335da1c3ec68eb0f9e228f269f6d8 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 17 Dec 2020 16:59:06 +0100 Subject: [PATCH 142/326] s390/dasd: fix list corruption of pavgroup group list dasd_alias_add_device() moves devices to the active_devices list in case of a scheduled LCU update regardless if they have previously been in a pavgroup or not. Example: device A and B are in the same pavgroup. Device A has already been in a pavgroup and the private->pavgroup pointer is set and points to a valid pavgroup. While going through dasd_add_device it is moved from the pavgroup to the active_devices list. In parallel device B might be removed from the same pavgroup in remove_device_from_lcu() which in turn checks if the group is empty and deletes it accordingly because device A has already been removed from there. When now device A enters remove_device_from_lcu() it is tried to remove it from the pavgroup again because the pavgroup pointer is still set and again the empty group will be cleaned up which leads to a list corruption. Fix by setting private->pavgroup to NULL in dasd_add_device. If the device has been the last device on the pavgroup an empty pavgroup remains but this will be cleaned up by the scheduled lcu_update which iterates over all existing pavgroups. Fixes: 8e09f21574ea ("[S390] dasd: add hyper PAV support to DASD device driver, part 1") Cc: stable@vger.kernel.org Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_alias.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index f841518de6c5..60344e13e87b 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -642,6 +642,7 @@ int dasd_alias_add_device(struct dasd_device *device) } if (lcu->flags & UPDATE_PENDING) { list_move(&device->alias_list, &lcu->active_devices); + private->pavgroup = NULL; _schedule_lcu_update(lcu, device); } spin_unlock_irqrestore(&lcu->lock, flags); From 53a7f655834c7c335bf683f248208d4fbe4b47bc Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Thu, 17 Dec 2020 16:59:07 +0100 Subject: [PATCH 143/326] s390/dasd: fix list corruption of lcu list In dasd_alias_disconnect_device_from_lcu the device is removed from any list on the LCU. Afterwards the LCU is removed from the lcu list if it does not contain devices any longer. The lcu->lock protects the lcu from parallel updates. But to cancel all workers and wait for completion the lcu->lock has to be unlocked. If two devices are removed in parallel and both are removed from the LCU the first device that takes the lcu->lock again will delete the LCU because it is already empty but the second device also tries to free the LCU which leads to a list corruption of the lcu list. Fix by removing the device right before the lcu is checked without unlocking the lcu->lock in between. Fixes: 8e09f21574ea ("[S390] dasd: add hyper PAV support to DASD device driver, part 1") Cc: stable@vger.kernel.org Signed-off-by: Stefan Haberland Reviewed-by: Jan Hoeppner Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_alias.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index 60344e13e87b..dc78a523a69f 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -256,7 +256,6 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device) return; device->discipline->get_uid(device, &uid); spin_lock_irqsave(&lcu->lock, flags); - list_del_init(&device->alias_list); /* make sure that the workers don't use this device */ if (device == lcu->suc_data.device) { spin_unlock_irqrestore(&lcu->lock, flags); @@ -283,6 +282,7 @@ void dasd_alias_disconnect_device_from_lcu(struct dasd_device *device) spin_lock_irqsave(&aliastree.lock, flags); spin_lock(&lcu->lock); + list_del_init(&device->alias_list); if (list_empty(&lcu->grouplist) && list_empty(&lcu->active_devices) && list_empty(&lcu->inactive_devices)) { From 71425189b2b75336d869cfdedea45c9d319fc9c9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Dec 2020 20:13:54 +0100 Subject: [PATCH 144/326] blk-mq: Don't complete on a remote CPU in force threaded mode With force threaded interrupts enabled, raising softirq from an SMP function call will always result in waking the ksoftirqd thread. This is not optimal given that the thread runs at SCHED_OTHER priority. Completing the request in hard IRQ-context on PREEMPT_RT (which enforces the force threaded mode) is bad because the completion handler may acquire sleeping locks which violate the locking context. Disable request completing on a remote CPU in force threaded mode. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a428798e1c5c..c338c9bc5a2c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -650,6 +650,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; + /* + * With force threaded interrupts enabled, raising softirq from an SMP + * function call will always result in waking the ksoftirqd thread. + * This is probably worse than completing the request on a different + * cache domain. + */ + if (force_irqthreads) + return false; /* same CPU or cache domain? Complete locally */ if (cpu == rq->mq_ctx->cpu || From 0b9491b621196a5d7f163dde81d98e0687bdba97 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 16 Dec 2020 21:27:33 +0800 Subject: [PATCH 145/326] watchdog: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20201216132733.15635-1-zhengyongjun3@huawei.com Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/mpc8xxx_wdt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c index 3fc457bc16db..2f7ded32e878 100644 --- a/drivers/watchdog/mpc8xxx_wdt.c +++ b/drivers/watchdog/mpc8xxx_wdt.c @@ -175,8 +175,8 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev) spin_lock_init(&ddata->lock); - ddata->wdd.info = &mpc8xxx_wdt_info, - ddata->wdd.ops = &mpc8xxx_wdt_ops, + ddata->wdd.info = &mpc8xxx_wdt_info; + ddata->wdd.ops = &mpc8xxx_wdt_ops; ddata->wdd.timeout = WATCHDOG_TIMEOUT; watchdog_init_timeout(&ddata->wdd, timeout, dev); From 6cc980e3f52e2e8db6d0d3bad076b495bd492658 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Wed, 16 Dec 2020 17:04:23 -0500 Subject: [PATCH 146/326] drm/amdkfd: PCIe atomics required for gfx10 GFX10 CP firmware expects PCIe atomics support. Don't enumerate GFX10 devices on platforms (PCIe v2) that don't support PCIe atomics. Currently, some of the applications like clinfo soft hangs on platforms without PCIe atomics support. Signed-off-by: Harish Kasiviswanathan Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 50922ff2927b..72c893fff61a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -422,7 +422,7 @@ static const struct kfd_device_info navi10_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, @@ -440,7 +440,7 @@ static const struct kfd_device_info navi12_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, @@ -458,7 +458,7 @@ static const struct kfd_device_info navi14_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, @@ -476,7 +476,7 @@ static const struct kfd_device_info sienna_cichlid_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 4, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, @@ -494,7 +494,7 @@ static const struct kfd_device_info navy_flounder_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, @@ -530,7 +530,7 @@ static const struct kfd_device_info dimgrey_cavefish_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .needs_iommu_device = false, .supports_cwsr = true, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, From 9e3a6ab74ff80128c337d5f95ce1867a452dc67e Mon Sep 17 00:00:00 2001 From: Xiaomeng Hou Date: Thu, 17 Dec 2020 10:41:10 +0800 Subject: [PATCH 147/326] drm/amd/pm: check pmfw version before issuing RlcPowerNotify message Only pmfw version behind v4.63.23.00 could support this message. Signed-off-by: Xiaomeng Hou Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 9bccf2ad038c..8cb4fcee9a2c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -724,8 +724,13 @@ static int vangogh_set_fine_grain_gfx_freq_parameters(struct smu_context *smu) static int vangogh_system_features_control(struct smu_context *smu, bool en) { - return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_RlcPowerNotify, - en ? RLC_STATUS_NORMAL : RLC_STATUS_OFF, NULL); + struct amdgpu_device *adev = smu->adev; + + if (adev->pm.fw_version >= 0x43f1700) + return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_RlcPowerNotify, + en ? RLC_STATUS_NORMAL : RLC_STATUS_OFF, NULL); + else + return 0; } static const struct pptable_funcs vangogh_ppt_funcs = { From 088fb29b40f2c78bfe01cebce1a1506b6f7e56d1 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 16 Dec 2020 11:36:28 -0500 Subject: [PATCH 148/326] drm/amdgpu: fix vbios reservation handling on SR-IOV There is no reserveration so set the size to 0. Fixes a regression on SR-IOV. Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index c2ced5be6d7b..6e679db5e46f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -496,7 +496,8 @@ void amdgpu_gmc_get_vbios_allocations(struct amdgpu_device *adev) break; } - if (!amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) { + if (amdgpu_sriov_vf(adev) || + !amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE)) { size = 0; } else { size = amdgpu_gmc_get_vbios_fb_size(adev); From 8bcbe3132c66c07d03f64d5da80be753359f2e92 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 20 Nov 2020 17:20:57 +0800 Subject: [PATCH 149/326] device-dax: delete a redundancy check in dev_dax_validate_align() After we have done the alignment check for the length of each range, the alignment check for dev_dax_size(dev_dax) is no longer needed, because it get the sum of the length of each range. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20201120092057.2144-1-thunder.leizhen@huawei.com Signed-off-by: Dan Williams --- drivers/dax/bus.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 27513d311242..9761cb40d4bb 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -1114,16 +1114,9 @@ static ssize_t align_show(struct device *dev, static ssize_t dev_dax_validate_align(struct dev_dax *dev_dax) { - resource_size_t dev_size = dev_dax_size(dev_dax); struct device *dev = &dev_dax->dev; int i; - if (dev_size > 0 && !alloc_is_aligned(dev_dax, dev_size)) { - dev_dbg(dev, "%s: align %u invalid for size %pa\n", - __func__, dev_dax->align, &dev_size); - return -EINVAL; - } - for (i = 0; i < dev_dax->nr_range; i++) { size_t len = range_len(&dev_dax->ranges[i].range); From 29c9dece56ca82c510c39a0e9403b80bdb3032d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 17 Dec 2020 17:36:57 +0100 Subject: [PATCH 150/326] drm/qxl: don't allocate a dma_address array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That seems to be unused. Signed-off-by: Christian König Reviewed-by: David Airlie Link: https://patchwork.freedesktop.org/patch/408787/ --- drivers/gpu/drm/qxl/qxl_ttm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 128c38c8a837..7dd0c69baa47 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -115,7 +115,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct ttm_buffer_object *bo, ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL); if (ttm == NULL) return NULL; - if (ttm_dma_tt_init(ttm, bo, page_flags, ttm_cached)) { + if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) { kfree(ttm); return NULL; } From 0020ef04e48571a88d4f482ad08f71052c5c5a08 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 18 Dec 2020 15:26:48 +0800 Subject: [PATCH 151/326] io_uring: fix io_wqe->work_list corruption For the first time a req punted to io-wq, we'll initialize io_wq_work's list to be NULL, then insert req to io_wqe->work_list. If this req is not inserted into tail of io_wqe->work_list, this req's io_wq_work list will point to another req's io_wq_work. For splitted bio case, this req maybe inserted to io_wqe->work_list repeatedly, once we insert it to tail of io_wqe->work_list for the second time, now io_wq_work->list->next will be invalid pointer, which then result in many strang error, panic, kernel soft-lockup, rcu stall, etc. In my vm, kernel doest not have commit cc29e1bf0d63f7 ("block: disable iopoll for split bio"), below fio job can reproduce this bug steadily: [global] name=iouring-sqpoll-iopoll-1 ioengine=io_uring iodepth=128 numjobs=1 thread rw=randread direct=1 registerfiles=1 hipri=1 bs=4m size=100M runtime=120 time_based group_reporting randrepeat=0 [device] directory=/home/feiman.wxg/mntpoint/ # an ext4 mount point If we have commit cc29e1bf0d63f7 ("block: disable iopoll for split bio"), there will no splitted bio case for polled io, but I think we still to need to fix this list corruption, it also should maybe go to stable branchs. To fix this corruption, if a req is inserted into tail of io_wqe->work_list, initialize req->io_wq_work->list->next to bu NULL. Cc: stable@vger.kernel.org Signed-off-by: Xiaoguang Wang Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io-wq.h b/fs/io-wq.h index 069496c6d4f9..75113bcd5889 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -59,6 +59,7 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } + node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, From dfea9fce29fda6f2f91161677e0e0d9b671bc099 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 18 Dec 2020 13:12:21 +0000 Subject: [PATCH 152/326] io_uring: close a small race gap for files cancel The purpose of io_uring_cancel_files() is to wait for all requests matching ->files to go/be cancelled. We should first drop files of a request in io_req_drop_files() and only then make it undiscoverable for io_uring_cancel_files. First drop, then delete from list. It's ok to leave req->id->files dangling, because it's not dereferenced by cancellation code, only compared against. It would potentially go to sleep and be awaken by following in io_req_drop_files() wake_up(). Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Cc: # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cf6f22afc5e..b74957856e68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6098,15 +6098,15 @@ static void io_req_drop_files(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; - spin_lock_irqsave(&ctx->inflight_lock, flags); - list_del(&req->inflight_entry); - if (atomic_read(&tctx->in_idle)) - wake_up(&tctx->wait); - spin_unlock_irqrestore(&ctx->inflight_lock, flags); - req->flags &= ~REQ_F_INFLIGHT; put_files_struct(req->work.identity->files); put_nsproxy(req->work.identity->nsproxy); + spin_lock_irqsave(&ctx->inflight_lock, flags); + list_del(&req->inflight_entry); + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + req->flags &= ~REQ_F_INFLIGHT; req->work.flags &= ~IO_WQ_WORK_FILES; + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); } static void __io_clean_op(struct io_kiocb *req) From f2283366c2919fda71e6eb725c3e5c1bd47bae1a Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Fri, 18 Dec 2020 16:34:00 +0100 Subject: [PATCH 153/326] ALSA: pcm: Remove snd_pcm_lib_preallocate_dma_free() Since commit d4cfb30fce03 ("ALSA: pcm: Set per-card upper limit of PCM buffer allocations") snd_pcm_lib_preallocate_dma_free() is a single line function that has one caller, which is another single line function. Clean this up a bit and remove snd_pcm_lib_preallocate_dma_free() and directly call do_free_pages() from snd_pcm_lib_preallocate_free(). This is a bit less boilerplate. Signed-off-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20201218153400.18394-1-lars@metafoo.de Signed-off-by: Takashi Iwai --- sound/core/pcm_memory.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index 4f03ba8ed0ae..ee6e9c5eec45 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -89,14 +89,6 @@ static int preallocate_pcm_pages(struct snd_pcm_substream *substream, size_t siz return 0; } -/* - * release the preallocated buffer if not yet done. - */ -static void snd_pcm_lib_preallocate_dma_free(struct snd_pcm_substream *substream) -{ - do_free_pages(substream->pcm->card, &substream->dma_buffer); -} - /** * snd_pcm_lib_preallocate_free - release the preallocated buffer of the specified substream. * @substream: the pcm substream instance @@ -105,7 +97,7 @@ static void snd_pcm_lib_preallocate_dma_free(struct snd_pcm_substream *substream */ void snd_pcm_lib_preallocate_free(struct snd_pcm_substream *substream) { - snd_pcm_lib_preallocate_dma_free(substream); + do_free_pages(substream->pcm->card, &substream->dma_buffer); } /** From 9df28edce7c6ab38050235f6f8b43dd7ccd01b6d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Dec 2020 15:58:58 +0100 Subject: [PATCH 154/326] ALSA: usb-audio: Disable sample read check if firmware doesn't give back Some buggy firmware don't give the current sample rate but leaves zero. Handle this case more gracefully without warning but just skip the current rate verification from the next time. Cc: Link: https://lore.kernel.org/r/20201218145858.2357-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/clock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index e940dcee792b..31051f2be46d 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -534,6 +534,12 @@ static int set_sample_rate_v1(struct snd_usb_audio *chip, } crate = data[0] | (data[1] << 8) | (data[2] << 16); + if (!crate) { + dev_info(&dev->dev, "failed to read current rate; disabling the check\n"); + chip->sample_rate_read_error = 3; /* three strikes, see above */ + return 0; + } + if (crate != rate) { dev_warn(&dev->dev, "current rate %d is different from the runtime rate %d\n", crate, rate); // runtime->rate = crate; From 5c1733e33c888a3cb7f576564d8ad543d5ad4a9e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Dec 2020 15:56:24 +0100 Subject: [PATCH 155/326] ALSA: memalloc: Align buffer allocations in page size Currently the standard memory allocator (snd_dma_malloc_pages*()) passes the byte size to allocate as is. Most of the backends allocates real pages, hence the actual allocations are aligned in page size. However, the genalloc doesn't seem assuring the size alignment, hence it may result in the access outside the buffer when the whole memory pages are exposed via mmap. For avoiding such inconsistencies, this patch makes the allocation size always to be aligned in page size. Note that, after this change, snd_dma_buffer.bytes field contains the aligned size, not the originally requested size. This value is also used for releasing the pages in return. Reviewed-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20201218145625.2045-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/memalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index 0f335162f87c..966bef5acc75 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -133,6 +133,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, if (WARN_ON(!dmab)) return -ENXIO; + size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->bytes = 0; From 618de0f4ef11acd8cf26902e65493d46cc20cc89 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Dec 2020 15:56:25 +0100 Subject: [PATCH 156/326] ALSA: pcm: Clear the full allocated memory at hw_params The PCM hw_params core function tries to clear up the PCM buffer before actually using for avoiding the information leak from the previous usages or the usage before a new allocation. It performs the memset() with runtime->dma_bytes, but this might still leave some remaining bytes untouched; namely, the PCM buffer size is aligned in page size for mmap, hence runtime->dma_bytes doesn't necessarily cover all PCM buffer pages, and the remaining bytes are exposed via mmap. This patch changes the memory clearance to cover the all buffer pages if the stream is supposed to be mmap-ready (that guarantees that the buffer size is aligned in page size). Reviewed-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20201218145625.2045-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 47b155a49226..9f3f8e953ff0 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -755,8 +755,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, runtime->boundary *= 2; /* clear the buffer for avoiding possible kernel info leaks */ - if (runtime->dma_area && !substream->ops->copy_user) - memset(runtime->dma_area, 0, runtime->dma_bytes); + if (runtime->dma_area && !substream->ops->copy_user) { + size_t size = runtime->dma_bytes; + + if (runtime->info & SNDRV_PCM_INFO_MMAP) + size = PAGE_ALIGN(size); + memset(runtime->dma_area, 0, size); + } snd_pcm_timer_resolution_change(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); From 11cb881bf075cea41092a20236ba708b18e1dbb2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 18 Dec 2020 17:17:30 +0100 Subject: [PATCH 157/326] ALSA: pcm: oss: Fix a few more UBSAN fixes There are a few places that call round{up|down}_pow_of_two() with the value zero, and this causes undefined behavior warnings. Avoid calling those macros if such a nonsense value is passed; it's a minor optimization as well, as we handle it as either an error or a value to be skipped, instead. Reported-by: syzbot+33ef0b6639a8d2d42b4c@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/20201218161730.26596-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_oss.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index de1917484647..142fc751a847 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -693,6 +693,8 @@ static int snd_pcm_oss_period_size(struct snd_pcm_substream *substream, oss_buffer_size = snd_pcm_plug_client_size(substream, snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, NULL)) * oss_frame_size; + if (!oss_buffer_size) + return -EINVAL; oss_buffer_size = rounddown_pow_of_two(oss_buffer_size); if (atomic_read(&substream->mmap_count)) { if (oss_buffer_size > runtime->oss.mmap_bytes) @@ -728,17 +730,21 @@ static int snd_pcm_oss_period_size(struct snd_pcm_substream *substream, min_period_size = snd_pcm_plug_client_size(substream, snd_pcm_hw_param_value_min(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL)); - min_period_size *= oss_frame_size; - min_period_size = roundup_pow_of_two(min_period_size); - if (oss_period_size < min_period_size) - oss_period_size = min_period_size; + if (min_period_size) { + min_period_size *= oss_frame_size; + min_period_size = roundup_pow_of_two(min_period_size); + if (oss_period_size < min_period_size) + oss_period_size = min_period_size; + } max_period_size = snd_pcm_plug_client_size(substream, snd_pcm_hw_param_value_max(slave_params, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, NULL)); - max_period_size *= oss_frame_size; - max_period_size = rounddown_pow_of_two(max_period_size); - if (oss_period_size > max_period_size) - oss_period_size = max_period_size; + if (max_period_size) { + max_period_size *= oss_frame_size; + max_period_size = rounddown_pow_of_two(max_period_size); + if (oss_period_size > max_period_size) + oss_period_size = max_period_size; + } oss_periods = oss_buffer_size / oss_period_size; From 4b501262826f5b20f54433c586b111dd190bea25 Mon Sep 17 00:00:00 2001 From: James Qian Wang Date: Thu, 19 Nov 2020 09:39:48 +0800 Subject: [PATCH 158/326] drm/komeda: Correct the sequence of hw_done() and flip_done() Komeda HW has no special, program the update to HW is done first, then flip happens. So correct the sequence to hw_done() first then flip_done(). Reported-by: Daniel Vetter Signed-off-by: James Qian Wang Acked-by: Daniel Vetter Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20201119013948.2866343-1-james.qian.wang@arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_kms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c index 6b99df696384..034ee08482e0 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_kms.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_kms.c @@ -81,10 +81,10 @@ static void komeda_kms_commit_tail(struct drm_atomic_state *old_state) drm_atomic_helper_commit_modeset_enables(dev, old_state); - drm_atomic_helper_wait_for_flip_done(dev, old_state); - drm_atomic_helper_commit_hw_done(old_state); + drm_atomic_helper_wait_for_flip_done(dev, old_state); + drm_atomic_helper_cleanup_planes(dev, old_state); } From 8e8fbfc682481b7f814985341020129161afd9de Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Fri, 27 Nov 2020 11:00:27 +0000 Subject: [PATCH 159/326] drm/komeda: Remove useless variable assignment ret is not actually read after this (only written in one case then returned), so this assign line is useless. This removes that assignment. Signed-off-by: Carsten Haitzler Reviewed-by: Steven Price Acked-by: Liviu Dudau Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20201127110027.133569-1-carsten.haitzler@foss.arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c index 1f8195bad536..ca891ae14d36 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_dev.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_dev.c @@ -152,7 +152,6 @@ static int komeda_parse_dt(struct device *dev, struct komeda_dev *mdev) ret = of_reserved_mem_device_init(dev); if (ret && ret != -ENODEV) return ret; - ret = 0; for_each_available_child_of_node(np, child) { if (of_node_name_eq(child, "pipeline")) { From a24cf238c771a1d3f0dc68b9f2b62c6d23359026 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Fri, 27 Nov 2020 11:00:54 +0000 Subject: [PATCH 160/326] drm/komeda: Handle NULL pointer access code path in error case komeda_component_get_old_state() technically can return a NULL pointer. komeda_compiz_set_input() even warns when this happens, but then proceeeds to use that NULL pointer to compare memory content there agains the new state to see if it changed. In this case, it's better to assume that the input changed as there is no old state to compare against and thus assume the changes happen anyway. Signed-off-by: Carsten Haitzler Reviewed-by: Steven Price Acked-by: Liviu Dudau [Applied small spelling fixes and fix suggested by Steven Price] Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20201127110054.133686-1-carsten.haitzler@foss.arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c index 8f32ae7c25d0..5c085116de3f 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c @@ -704,10 +704,10 @@ komeda_compiz_set_input(struct komeda_compiz *compiz, cin->layer_alpha = dflow->layer_alpha; old_st = komeda_component_get_old_state(&compiz->base, drm_st); - WARN_ON(!old_st); /* compare with old to check if this input has been changed */ - if (memcmp(&(to_compiz_st(old_st)->cins[idx]), cin, sizeof(*cin))) + if (WARN_ON(!old_st) || + memcmp(&(to_compiz_st(old_st)->cins[idx]), cin, sizeof(*cin))) c_st->changed_active_inputs |= BIT(idx); komeda_component_add_input(c_st, &dflow->input, idx); From be3e477effba636ad25dcd244db264c6cd5c1f36 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Fri, 18 Dec 2020 15:08:12 +0000 Subject: [PATCH 161/326] drm/komeda: Fix bit check to import to value of proper type KASAN found this problem. find_first_bit() expects to look at a pointer pointing to a long, but we look at a u32 - this is going to be an issue with endianness but, KSAN already flags this as out-of-bounds stack reads. This fixes it by just importing inot a local long. Signed-off-by: Carsten Haitzler Acked-by: Liviu Dudau Signed-off-by: Liviu Dudau Link: https://patchwork.freedesktop.org/patch/msgid/20201218150812.68195-1-carsten.haitzler@foss.arm.com --- drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c index 452e505a1fd3..719a79728e24 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline.c @@ -137,9 +137,10 @@ komeda_pipeline_get_first_component(struct komeda_pipeline *pipe, u32 comp_mask) { struct komeda_component *c = NULL; + unsigned long comp_mask_local = (unsigned long)comp_mask; int id; - id = find_first_bit((unsigned long *)&comp_mask, 32); + id = find_first_bit(&comp_mask_local, 32); if (id < 32) c = komeda_pipeline_get_component(pipe, id); From 6725f21157b4b6a9fe689cdf07b040d21ea536dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:47 +0100 Subject: [PATCH 162/326] virtio-mem: determine nid only once using memory_add_physaddr_to_nid() Let's determine the target nid only once in case we have none specified - usually, we'll end up with node 0 either way. Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-2-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 181e2f18beae..a37fd73588da 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -70,7 +70,7 @@ struct virtio_mem { /* The device block size (for communicating with the device). */ uint64_t device_block_size; - /* The translated node id. NUMA_NO_NODE in case not specified. */ + /* The determined node id for all memory of the device. */ int nid; /* Physical start address of the memory region. */ uint64_t addr; @@ -406,10 +406,6 @@ static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; - - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); /* * When force-unloading the driver and we still have memory added to @@ -423,7 +419,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) } dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); - return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), + return add_memory_driver_managed(vm->nid, addr, + memory_block_size_bytes(), vm->resource_name, MEMHP_MERGE_RESOURCE); } @@ -440,13 +437,9 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; - - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); - return remove_memory(nid, addr, memory_block_size_bytes()); + return remove_memory(vm->nid, addr, memory_block_size_bytes()); } /* @@ -461,14 +454,11 @@ static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - int nid = vm->nid; - - if (nid == NUMA_NO_NODE) - nid = memory_add_physaddr_to_nid(addr); dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", mb_id); - return offline_and_remove_memory(nid, addr, memory_block_size_bytes()); + return offline_and_remove_memory(vm->nid, addr, + memory_block_size_bytes()); } /* @@ -1659,6 +1649,10 @@ static int virtio_mem_init(struct virtio_mem *vm) virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size, &vm->region_size); + /* Determine the nid for the device based on the lowest address. */ + if (vm->nid == NUMA_NO_NODE) + vm->nid = memory_add_physaddr_to_nid(vm->addr); + /* * We always hotplug memory in memory block granularity. This way, * we have to wait for exactly one memory block to online. @@ -1707,7 +1701,7 @@ static int virtio_mem_init(struct virtio_mem *vm) memory_block_size_bytes()); dev_info(&vm->vdev->dev, "subblock size: 0x%llx", (unsigned long long)vm->subblock_size); - if (vm->nid != NUMA_NO_NODE) + if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) dev_info(&vm->vdev->dev, "nid: %d", vm->nid); return 0; From 347202dc04a110bdab8d4e1c38ceccd7758fe13e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:48 +0100 Subject: [PATCH 163/326] virtio-mem: more precise calculation in virtio_mem_mb_state_prepare_next_mb() We actually need one byte less (next_mb_id is exclusive, first_mb_id is inclusive). While at it, compact the code. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-3-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index a37fd73588da..dee46865bae2 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -257,10 +257,8 @@ static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, */ static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) { - unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; - unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; - int old_pages = PFN_UP(old_bytes); - int new_pages = PFN_UP(new_bytes); + int old_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id); + int new_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id + 1); uint8_t *new_mb_state; if (vm->mb_state && old_pages == new_pages) From 20b9150225c8e9599999b4e161192d8a8d56a4cb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:49 +0100 Subject: [PATCH 164/326] virtio-mem: simplify MAX_ORDER - 1 / pageblock_order handling Let's use pageblock_nr_pages and MAX_ORDER_NR_PAGES instead where possible to simplify. Add a comment why we have that restriction for now. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-4-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index dee46865bae2..0f9d854e8e42 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -753,14 +753,15 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, */ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) { - const int order = MAX_ORDER - 1; + const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; int i; /* - * We are always called with subblock granularity, which is at least - * aligned to MAX_ORDER - 1. + * We are always called at least with MAX_ORDER_NR_PAGES + * granularity/alignment (e.g., the way subblocks work). All pages + * inside such a block are alike. */ - for (i = 0; i < nr_pages; i += 1 << order) { + for (i = 0; i < nr_pages; i += max_nr_pages) { struct page *page = pfn_to_page(pfn + i); /* @@ -770,14 +771,14 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) * alike. */ if (PageDirty(page)) { - virtio_mem_clear_fake_offline(pfn + i, 1 << order, + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, false); - generic_online_page(page, order); + generic_online_page(page, MAX_ORDER - 1); } else { - virtio_mem_clear_fake_offline(pfn + i, 1 << order, + virtio_mem_clear_fake_offline(pfn + i, max_nr_pages, true); - free_contig_range(pfn + i, 1 << order); - adjust_managed_page_count(page, 1 << order); + free_contig_range(pfn + i, max_nr_pages); + adjust_managed_page_count(page, max_nr_pages); } } } @@ -790,7 +791,7 @@ static void virtio_mem_online_page_cb(struct page *page, unsigned int order) int sb_id; /* - * We exploit here that subblocks have at least MAX_ORDER - 1 + * We exploit here that subblocks have at least MAX_ORDER_NR_PAGES. * size/alignment and that this callback is is called with such a * size/alignment. So we cannot cross subblocks and therefore * also not memory blocks. @@ -1673,13 +1674,15 @@ static int virtio_mem_init(struct virtio_mem *vm) "Some memory is not addressable. This can make some memory unusable.\n"); /* - * Calculate the subblock size: - * - At least MAX_ORDER - 1 / pageblock_order. - * - At least the device block size. - * In the worst case, a single subblock per memory block. + * We want subblocks to span at least MAX_ORDER_NR_PAGES and + * pageblock_nr_pages pages. This: + * - Simplifies our page onlining code (virtio_mem_online_page_cb) + * and fake page onlining code (virtio_mem_fake_online). + * - Is required for now for alloc_contig_range() to work reliably - + * it doesn't properly handle smaller granularity on ZONE_NORMAL. */ - vm->subblock_size = PAGE_SIZE * 1ul << max_t(uint32_t, MAX_ORDER - 1, - pageblock_order); + vm->subblock_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) * PAGE_SIZE; vm->subblock_size = max_t(uint64_t, vm->device_block_size, vm->subblock_size); vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; From d76944f80d5f500c8be74feb7938edddf68ee931 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:50 +0100 Subject: [PATCH 165/326] virtio-mem: drop rc2 in virtio_mem_mb_plug_and_add() We can drop rc2, we don't actually need the value. Reviewed-by: Pankaj Gupta Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-5-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 0f9d854e8e42..4f18d9855a0e 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1070,7 +1070,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, uint64_t *nb_sb) { const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb); - int rc, rc2; + int rc; if (WARN_ON_ONCE(!count)) return -EINVAL; @@ -1101,13 +1101,12 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, dev_err(&vm->vdev->dev, "adding memory block %lu failed with %d\n", mb_id, rc); - rc2 = virtio_mem_mb_unplug_sb(vm, mb_id, 0, count); /* * TODO: Linux MM does not properly clean up yet in all cases * where adding of memory failed - especially on -ENOMEM. */ - if (rc2) + if (virtio_mem_mb_unplug_sb(vm, mb_id, 0, count)) new_state = VIRTIO_MEM_MB_STATE_PLUGGED; virtio_mem_mb_set_state(vm, mb_id, new_state); return rc; From 2a6285114bc543b70612e2bc0fcf13d2dd6ce5b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:51 +0100 Subject: [PATCH 166/326] virtio-mem: use "unsigned long" for nr_pages when fake onlining/offlining No harm done, but let's be consistent. Reviewed-by: Pankaj Gupta Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-6-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 4f18d9855a0e..94451b401fba 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -717,7 +717,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, * (via generic_online_page()) using PageDirty(). */ static void virtio_mem_set_fake_offline(unsigned long pfn, - unsigned int nr_pages, bool onlined) + unsigned long nr_pages, bool onlined) { for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -736,7 +736,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn, * (via generic_online_page()), clear PageDirty(). */ static void virtio_mem_clear_fake_offline(unsigned long pfn, - unsigned int nr_pages, bool onlined) + unsigned long nr_pages, bool onlined) { for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -751,10 +751,10 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn, * Release a range of fake-offline pages to the buddy, effectively * fake-onlining them. */ -static void virtio_mem_fake_online(unsigned long pfn, unsigned int nr_pages) +static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) { const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES; - int i; + unsigned long i; /* * We are always called at least with MAX_ORDER_NR_PAGES From 41e6215c6d29a7bbcee599411cdf0911fde1f09b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:52 +0100 Subject: [PATCH 167/326] virtio-mem: factor out calculation of the bit number within the subblock bitmap The calculation is already complicated enough, let's limit it to one location. Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-7-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 94451b401fba..30b4d07f5263 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -290,6 +290,16 @@ static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) _mb_id--) \ if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) +/* + * Calculate the bit number in the subblock bitmap for the given subblock + * inside the given memory block. + */ +static int virtio_mem_sb_bitmap_bit_nr(struct virtio_mem *vm, + unsigned long mb_id, int sb_id) +{ + return (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; +} + /* * Mark all selected subblocks plugged. * @@ -299,7 +309,7 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); __bitmap_set(vm->sb_bitmap, bit, count); } @@ -313,7 +323,7 @@ static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); __bitmap_clear(vm->sb_bitmap, bit, count); } @@ -325,7 +335,7 @@ static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); if (count == 1) return test_bit(bit, vm->sb_bitmap); @@ -342,7 +352,7 @@ static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); /* TODO: Helper similar to bitmap_set() */ return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; @@ -355,7 +365,7 @@ static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, unsigned long mb_id) { - const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; + const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, 0); return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - bit; From 6beb3a9421fd81d36bd4d87a6b307fc744ea9dd2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:53 +0100 Subject: [PATCH 168/326] virtio-mem: print debug messages from virtio_mem_send_*_request() Let's move the existing dev_dbg() into the functions, print if something went wrong, and also print for virtio_mem_send_unplug_all_request(). Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-8-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 50 ++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 30b4d07f5263..4742497feff0 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -869,23 +869,33 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr, .u.plug.addr = cpu_to_virtio64(vm->vdev, addr), .u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), }; + int rc = -ENOMEM; if (atomic_read(&vm->config_changed)) return -EAGAIN; + dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: vm->plugged_size += size; return 0; case VIRTIO_MEM_RESP_NACK: - return -EAGAIN; + rc = -EAGAIN; + break; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; case VIRTIO_MEM_RESP_ERROR: - return -EINVAL; + rc = -EINVAL; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc); + return rc; } static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, @@ -897,21 +907,30 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr, .u.unplug.addr = cpu_to_virtio64(vm->vdev, addr), .u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks), }; + int rc = -ENOMEM; if (atomic_read(&vm->config_changed)) return -EAGAIN; + dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: vm->plugged_size -= size; return 0; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; case VIRTIO_MEM_RESP_ERROR: - return -EINVAL; + rc = -EINVAL; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc); + return rc; } static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) @@ -919,6 +938,9 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) const struct virtio_mem_req req = { .type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL), }; + int rc = -ENOMEM; + + dev_dbg(&vm->vdev->dev, "unplugging all memory"); switch (virtio_mem_send_request(vm, &req)) { case VIRTIO_MEM_RESP_ACK: @@ -928,10 +950,14 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) atomic_set(&vm->config_changed, 1); return 0; case VIRTIO_MEM_RESP_BUSY: - return -ETXTBSY; + rc = -ETXTBSY; + break; default: - return -ENOMEM; + break; } + + dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc); + return rc; } /* @@ -946,9 +972,6 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, const uint64_t size = count * vm->subblock_size; int rc; - dev_dbg(&vm->vdev->dev, "plugging memory block: %lu : %i - %i\n", mb_id, - sb_id, sb_id + count - 1); - rc = virtio_mem_send_plug_request(vm, addr, size); if (!rc) virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count); @@ -967,9 +990,6 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, const uint64_t size = count * vm->subblock_size; int rc; - dev_dbg(&vm->vdev->dev, "unplugging memory block: %lu : %i - %i\n", - mb_id, sb_id, sb_id + count - 1); - rc = virtio_mem_send_unplug_request(vm, addr, size); if (!rc) virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count); From 89c486c47f2a450d7f064b4927b7f0ab911569a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:54 +0100 Subject: [PATCH 169/326] virtio-mem: factor out fake-offlining into virtio_mem_fake_offline() ... which now matches virtio_mem_fake_online(). We'll reuse this functionality soon. Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-9-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 4742497feff0..fedfea27967e 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -793,6 +793,27 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) } } +/* + * Try to allocate a range, marking pages fake-offline, effectively + * fake-offlining them. + */ +static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) +{ + int rc; + + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, + GFP_KERNEL); + if (rc == -ENOMEM) + /* whoops, out of memory */ + return rc; + if (rc) + return -EBUSY; + + virtio_mem_set_fake_offline(pfn, nr_pages, true); + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + return 0; +} + static void virtio_mem_online_page_cb(struct page *page, unsigned int order) { const unsigned long addr = page_to_phys(page); @@ -1328,17 +1349,10 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->subblock_size); - rc = alloc_contig_range(start_pfn, start_pfn + nr_pages, - MIGRATE_MOVABLE, GFP_KERNEL); - if (rc == -ENOMEM) - /* whoops, out of memory */ - return rc; - if (rc) - return -EBUSY; - /* Mark it as fake-offline before unplugging it */ - virtio_mem_set_fake_offline(start_pfn, nr_pages, true); - adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + rc = virtio_mem_fake_offline(start_pfn, nr_pages); + if (rc) + return rc; /* Try to unplug the allocated memory */ rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); From 7a34c77dab7e0c7ecb58da8bf600b7aadb4d878c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:55 +0100 Subject: [PATCH 170/326] virtio-mem: factor out handling of fake-offline pages in memory notifier Let's factor out the core pieces and place the implementation next to virtio_mem_fake_offline(). We'll reuse this functionality soon. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-10-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 73 +++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index fedfea27967e..c24055248f9d 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -160,6 +160,10 @@ static DEFINE_MUTEX(virtio_mem_mutex); static LIST_HEAD(virtio_mem_devices); static void virtio_mem_online_page_cb(struct page *page, unsigned int order); +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages); +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages); /* * Register a virtio-mem device so it will be considered for the online_page @@ -586,27 +590,15 @@ static void virtio_mem_notify_going_offline(struct virtio_mem *vm, unsigned long mb_id) { const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); - struct page *page; unsigned long pfn; - int sb_id, i; + int sb_id; for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Drop our reference to the pages so the memory can get - * offlined and add the unplugged pages to the managed - * page counters (so offlining code can correctly subtract - * them again). - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), nr_pages); - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(pfn + i); - if (WARN_ON(!page_ref_dec_and_test(page))) - dump_page(page, "unplugged page referenced"); - } + virtio_mem_fake_offline_going_offline(pfn, nr_pages); } } @@ -615,21 +607,14 @@ static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, { const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); unsigned long pfn; - int sb_id, i; + int sb_id; for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; - /* - * Get the reference we dropped when going offline and - * subtract the unplugged pages from the managed page - * counters. - */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->subblock_size); - adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); - for (i = 0; i < nr_pages; i++) - page_ref_inc(pfn_to_page(pfn + i)); + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); } } @@ -814,6 +799,48 @@ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) return 0; } +/* + * Handle fake-offline pages when memory is going offline - such that the + * pages can be skipped by mm-core when offlining. + */ +static void virtio_mem_fake_offline_going_offline(unsigned long pfn, + unsigned long nr_pages) +{ + struct page *page; + unsigned long i; + + /* + * Drop our reference to the pages so the memory can get offlined + * and add the unplugged pages to the managed page counters (so + * offlining code can correctly subtract them again). + */ + adjust_managed_page_count(pfn_to_page(pfn), nr_pages); + /* Drop our reference to the pages so the memory can get offlined. */ + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(pfn + i); + if (WARN_ON(!page_ref_dec_and_test(page))) + dump_page(page, "fake-offline page referenced"); + } +} + +/* + * Handle fake-offline pages when memory offlining is canceled - to undo + * what we did in virtio_mem_fake_offline_going_offline(). + */ +static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long i; + + /* + * Get the reference we dropped when going offline and subtract the + * unplugged pages from the managed page counters. + */ + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + for (i = 0; i < nr_pages; i++) + page_ref_inc(pfn_to_page(pfn + i)); +} + static void virtio_mem_online_page_cb(struct page *page, unsigned int order) { const unsigned long addr = page_to_phys(page); From f2d799d591359685a3a74d28c2989c56f4bb9898 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:56 +0100 Subject: [PATCH 171/326] virtio-mem: retry fake-offlining via alloc_contig_range() on ZONE_MOVABLE ZONE_MOVABLE is supposed to give some guarantees, yet, alloc_contig_range() isn't prepared to properly deal with some racy cases properly (e.g., temporary page pinning when exiting processed, PCP). Retry 5 times for now. There is certainly room for improvement in the future. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-11-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c24055248f9d..2f1ce4d4781b 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -784,19 +784,34 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) */ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) { - int rc; + const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == + ZONE_MOVABLE; + int rc, retry_count; - rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, - GFP_KERNEL); - if (rc == -ENOMEM) - /* whoops, out of memory */ - return rc; - if (rc) - return -EBUSY; + /* + * TODO: We want an alloc_contig_range() mode that tries to allocate + * harder (e.g., dealing with temporarily pinned pages, PCP), especially + * with ZONE_MOVABLE. So for now, retry a couple of times with + * ZONE_MOVABLE before giving up - because that zone is supposed to give + * some guarantees. + */ + for (retry_count = 0; retry_count < 5; retry_count++) { + rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE, + GFP_KERNEL); + if (rc == -ENOMEM) + /* whoops, out of memory */ + return rc; + else if (rc && !is_movable) + break; + else if (rc) + continue; - virtio_mem_set_fake_offline(pfn, nr_pages, true); - adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); - return 0; + virtio_mem_set_fake_offline(pfn, nr_pages, true); + adjust_managed_page_count(pfn_to_page(pfn), -nr_pages); + return 0; + } + + return -EBUSY; } /* From 989ff82527074b79bc89ba1c390be1eda01784a5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:57 +0100 Subject: [PATCH 172/326] virtio-mem: generalize check for added memory Let's check by traversing busy system RAM resources instead, to avoid relying on memory block states. Don't use walk_system_ram_range(), as that works on pages and we want to use the bare addresses we have easily at hand. This is a preparation for Big Block Mode (BBM), which won't have memory block states. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-12-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 2f1ce4d4781b..3731097cd9e8 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1833,6 +1833,20 @@ static void virtio_mem_delete_resource(struct virtio_mem *vm) vm->parent_resource = NULL; } +static int virtio_mem_range_has_system_ram(struct resource *res, void *arg) +{ + return 1; +} + +static bool virtio_mem_has_memory_added(struct virtio_mem *vm) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr, + vm->addr + vm->region_size, NULL, + virtio_mem_range_has_system_ram) == 1; +} + static int virtio_mem_probe(struct virtio_device *vdev) { struct virtio_mem *vm; @@ -1954,10 +1968,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) * the system. And there is no way to stop the driver/device from going * away. Warn at least. */ - if (vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE] || - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL]) { + if (virtio_mem_has_memory_added(vm)) { dev_warn(&vdev->dev, "device still has system memory added\n"); } else { virtio_mem_delete_resource(vm); From 8464e3bdf208e86410e369601ca363b2a81683e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:58 +0100 Subject: [PATCH 173/326] virtio-mem: generalize virtio_mem_owned_mb() Avoid using memory block ids. Rename it to virtio_mem_contains_range(). This is a preparation for Big Block Mode (BBM). Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-13-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 3731097cd9e8..2193c5172195 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -512,12 +512,13 @@ static bool virtio_mem_overlaps_range(struct virtio_mem *vm, } /* - * Test if a virtio-mem device owns a memory block. Can be called from + * Test if a virtio-mem device contains a given range. Can be called from * (notifier) callbacks lockless. */ -static bool virtio_mem_owned_mb(struct virtio_mem *vm, unsigned long mb_id) +static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - return mb_id >= vm->first_mb_id && mb_id <= vm->last_mb_id; + return start >= vm->addr && start + size <= vm->addr + vm->region_size; } static int virtio_mem_notify_going_online(struct virtio_mem *vm, @@ -871,7 +872,7 @@ static void virtio_mem_online_page_cb(struct page *page, unsigned int order) */ rcu_read_lock(); list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { - if (!virtio_mem_owned_mb(vm, mb_id)) + if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) continue; sb_id = virtio_mem_phys_to_sb_id(vm, addr); From 835491c554fbdbc18452f4b1546df21879b8b26d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:37:59 +0100 Subject: [PATCH 174/326] virtio-mem: generalize virtio_mem_overlaps_range() Avoid using memory block ids. While at it, use uint64_t for address/size. This is a preparation for Big Block Mode (BBM). Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-14-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 2193c5172195..bd76aa79a82e 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -501,14 +501,10 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id) * Test if a virtio-mem device overlaps with the given range. Can be called * from (notifier) callbacks lockless. */ -static bool virtio_mem_overlaps_range(struct virtio_mem *vm, - unsigned long start, unsigned long size) +static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start, + uint64_t size) { - unsigned long dev_start = virtio_mem_mb_id_to_phys(vm->first_mb_id); - unsigned long dev_end = virtio_mem_mb_id_to_phys(vm->last_mb_id) + - memory_block_size_bytes(); - - return start < dev_end && dev_start < start + size; + return start < vm->addr + vm->region_size && vm->addr < start + size; } /* From 420066829bb614826115892e81f85b8c4341ee95 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:00 +0100 Subject: [PATCH 175/326] virtio-mem: drop last_mb_id No longer used, let's drop it. Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-15-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index bd76aa79a82e..a7beac5942e0 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -84,8 +84,6 @@ struct virtio_mem { /* Id of the first memory block of this device. */ unsigned long first_mb_id; - /* Id of the last memory block of this device. */ - unsigned long last_mb_id; /* Id of the last usable memory block of this device. */ unsigned long last_usable_mb_id; /* Id of the next memory bock to prepare when needed. */ @@ -1773,8 +1771,6 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + memory_block_size_bytes()); vm->next_mb_id = vm->first_mb_id; - vm->last_mb_id = virtio_mem_phys_to_mb_id(vm->addr + - vm->region_size) - 1; dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); From 1d33c2caa8cbdc0f093a8cdad5a4c153ef9cbe8f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:01 +0100 Subject: [PATCH 176/326] virtio-mem: don't always trigger the workqueue when offlining memory Let's trigger from offlining code only when we're not allowed to unplug online memory. Handle the other case (memmap possibly freeing up another memory block) when actually removing memory. We now also properly handle the case when removing already offline memory blocks via virtio_mem_mb_remove(). When removing via virtio_mem_remove(), when unloading the driver, virtio_mem_retry() is a NOP and safe to use. While at it, move retry handling when offlining out of virtio_mem_notify_offline(), to share it with Big Block Mode (BBM) soon. This is a preparation for Big Block Mode (BBM), whereby we can see some temporary offlining of memory blocks without actually making progress. Imagine you have a Big Block that spans to Linux memory blocks. Assume the first Linux memory blocks has no unmovable data on it. When we would call offline_and_remove_memory() on the big block, we would 1. Try to offline the first block. Works, notifiers triggered. virtio_mem_retry() called. 2. Try to offline the second block. Does not work. 3. Re-online first block. 4. Exit to main loop, exit workqueue. 5. Retry immediately (due to virtio_mem_retry()), go to 1. The result are endless retries. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-16-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 40 ++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index a7beac5942e0..f86654af8b6b 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -162,6 +162,7 @@ static void virtio_mem_fake_offline_going_offline(unsigned long pfn, unsigned long nr_pages); static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, unsigned long nr_pages); +static void virtio_mem_retry(struct virtio_mem *vm); /* * Register a virtio-mem device so it will be considered for the online_page @@ -447,9 +448,17 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + int rc; dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); - return remove_memory(vm->nid, addr, memory_block_size_bytes()); + rc = remove_memory(vm->nid, addr, memory_block_size_bytes()); + if (!rc) + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + return rc; } /* @@ -464,11 +473,19 @@ static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + int rc; dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", mb_id); - return offline_and_remove_memory(vm->nid, addr, - memory_block_size_bytes()); + rc = offline_and_remove_memory(vm->nid, addr, + memory_block_size_bytes()); + if (!rc) + /* + * We might have freed up memory we can now unplug, retry + * immediately instead of waiting. + */ + virtio_mem_retry(vm); + return rc; } /* @@ -546,15 +563,6 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, BUG(); break; } - - /* - * Trigger the workqueue, maybe we can now unplug memory. Also, - * when we offline and remove a memory block, this will re-trigger - * us immediately - which is often nice because the removal of - * the memory block (e.g., memmap) might have freed up memory - * on other memory blocks we manage. - */ - virtio_mem_retry(vm); } static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) @@ -672,6 +680,14 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; case MEM_OFFLINE: virtio_mem_notify_offline(vm, mb_id); + + /* + * Trigger the workqueue. Now that we have some offline memory, + * maybe we can handle pending unplug requests. + */ + if (!unplug_online) + virtio_mem_retry(vm); + vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; From 98ff9f9411860073f952f1e62a05afb9f6a9e77e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:02 +0100 Subject: [PATCH 177/326] virtio-mem: generalize handling when memory is getting onlined deferred We don't want to add too much memory when it's not getting onlined immediately, to avoid running OOM. Generalize the handling, to avoid making use of memory block states. Use a threshold of 1 GiB for now. Properly adjust the offline size when adding/removing memory. As we are not always protected by a lock when touching the offline size, use an atomic64_t. We don't care about races (e.g., someone offlining memory while we are adding more), only about consistent values. (1 GiB needs a memmap of ~16MiB - which sounds reasonable even for setups with little boot memory and (possibly) one virtio-mem device per node) We don't want to retrigger when onlining is caused immediately by our action (e.g., adding memory which immediately gets onlined), so use a flag to indicate if the workqueue is active and use that as an indicator whether to trigger a retry. This will also be especially relevant for Big Block Mode (BBM), whereby we might re-online memory in case offlining of another memory block failed. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-17-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 95 ++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index f86654af8b6b..cbd0aa5eb95c 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -51,6 +51,7 @@ struct virtio_mem { /* Workqueue that processes the plug/unplug requests. */ struct work_struct wq; + atomic_t wq_active; atomic_t config_changed; /* Virtqueue for guest->host requests. */ @@ -99,7 +100,15 @@ struct virtio_mem { /* Summary of all memory block states. */ unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; -#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10 + + /* + * We don't want to add too much memory if it's not getting onlined, + * to avoid running OOM. Besides this threshold, we allow to have at + * least two offline blocks at a time (whatever is bigger). + */ +#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024) + atomic64_t offline_size; + uint64_t offline_threshold; /* * One byte state per memory block. @@ -405,6 +414,18 @@ static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) return 0; } +/* + * Test if we could add memory without creating too much offline memory - + * to avoid running OOM if memory is getting onlined deferred. + */ +static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) +{ + if (WARN_ON_ONCE(size > vm->offline_threshold)) + return false; + + return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold; +} + /* * Try to add a memory block to Linux. This will usually only fail * if out of memory. @@ -417,6 +438,8 @@ static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + int rc; /* * When force-unloading the driver and we still have memory added to @@ -430,10 +453,13 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) } dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); - return add_memory_driver_managed(vm->nid, addr, - memory_block_size_bytes(), - vm->resource_name, - MEMHP_MERGE_RESOURCE); + /* Memory might get onlined immediately. */ + atomic64_add(size, &vm->offline_size); + rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, + MEMHP_MERGE_RESOURCE); + if (rc) + atomic64_sub(size, &vm->offline_size); + return rc; } /* @@ -448,16 +474,19 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); int rc; dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); - rc = remove_memory(vm->nid, addr, memory_block_size_bytes()); - if (!rc) + rc = remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); /* * We might have freed up memory we can now unplug, retry * immediately instead of waiting. */ virtio_mem_retry(vm); + } return rc; } @@ -473,18 +502,20 @@ static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, unsigned long mb_id) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); int rc; dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", mb_id); - rc = offline_and_remove_memory(vm->nid, addr, - memory_block_size_bytes()); - if (!rc) + rc = offline_and_remove_memory(vm->nid, addr, size); + if (!rc) { + atomic64_sub(size, &vm->offline_size); /* * We might have freed up memory we can now unplug, retry * immediately instead of waiting. */ virtio_mem_retry(vm); + } return rc; } @@ -567,8 +598,6 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) { - unsigned long nb_offline; - switch (virtio_mem_mb_get_state(vm, mb_id)) { case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: virtio_mem_mb_set_state(vm, mb_id, @@ -581,12 +610,6 @@ static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) BUG(); break; } - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; - - /* see if we can add new blocks now that we onlined one block */ - if (nb_offline == VIRTIO_MEM_NB_OFFLINE_THRESHOLD - 1) - virtio_mem_retry(vm); } static void virtio_mem_notify_going_offline(struct virtio_mem *vm, @@ -681,6 +704,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, case MEM_OFFLINE: virtio_mem_notify_offline(vm, mb_id); + atomic64_add(size, &vm->offline_size); /* * Trigger the workqueue. Now that we have some offline memory, * maybe we can handle pending unplug requests. @@ -693,6 +717,18 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; case MEM_ONLINE: virtio_mem_notify_online(vm, mb_id); + + atomic64_sub(size, &vm->offline_size); + /* + * Start adding more memory once we onlined half of our + * threshold. Don't trigger if it's possibly due to our actipn + * (e.g., us adding memory which gets onlined immediately from + * the core). + */ + if (!atomic_read(&vm->wq_active) && + virtio_mem_could_add_memory(vm, vm->offline_threshold / 2)) + virtio_mem_retry(vm); + vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -1151,18 +1187,6 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, return 0; } -/* - * Don't add too many blocks that are not onlined yet to avoid running OOM. - */ -static bool virtio_mem_too_many_mb_offline(struct virtio_mem *vm) -{ - unsigned long nb_offline; - - nb_offline = vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE] + - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL]; - return nb_offline >= VIRTIO_MEM_NB_OFFLINE_THRESHOLD; -} - /* * Try to plug the desired number of subblocks and add the memory block * to Linux. @@ -1316,7 +1340,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to plug and add unused blocks */ virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) { - if (virtio_mem_too_many_mb_offline(vm)) + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); @@ -1327,7 +1351,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to prepare, plug and add new blocks */ while (nb_sb) { - if (virtio_mem_too_many_mb_offline(vm)) + if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; rc = virtio_mem_prepare_next_mb(vm, &mb_id); @@ -1620,6 +1644,7 @@ static void virtio_mem_run_wq(struct work_struct *work) if (vm->broken) return; + atomic_set(&vm->wq_active, 1); retry: rc = 0; @@ -1680,6 +1705,8 @@ retry: "unknown error, marking device broken: %d\n", rc); vm->broken = true; } + + atomic_set(&vm->wq_active, 0); } static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer) @@ -1788,6 +1815,10 @@ static int virtio_mem_init(struct virtio_mem *vm) memory_block_size_bytes()); vm->next_mb_id = vm->first_mb_id; + /* Prepare the offline threshold - make sure we can add two blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); dev_info(&vm->vdev->dev, "device block size: 0x%llx", From d5614944254cf288b8fd46fda8c86d916346131d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:03 +0100 Subject: [PATCH 178/326] virito-mem: document Sub Block Mode (SBM) Let's add some documentation for the current mode - Sub Block Mode (SBM) - to prepare for a new mode - Big Block Mode (BBM). Follow-up patches will properly factor out the existing Sub Block Mode (SBM) and implement Big Block Mode (BBM). Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-18-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index cbd0aa5eb95c..4234bfc0cf52 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -27,6 +27,21 @@ static bool unplug_online = true; module_param(unplug_online, bool, 0644); MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); +/* + * virtio-mem currently supports the following modes of operation: + * + * * Sub Block Mode (SBM): A Linux memory block spans 1..X subblocks (SB). The + * size of a Sub Block (SB) is determined based on the device block size, the + * pageblock size, and the maximum allocation granularity of the buddy. + * Subblocks within a Linux memory block might either be plugged or unplugged. + * Memory is added/removed to Linux MM in Linux memory block granularity. + * + * User space / core MM (auto onlining) is responsible for onlining added + * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are + * always onlined separately, and all memory within a Linux memory block is + * onlined to the same zone - virtio-mem relies on this behavior. + */ + enum virtio_mem_mb_state { /* Unplugged, not added to Linux. Can be reused later. */ VIRTIO_MEM_MB_STATE_UNUSED = 0, From 99f0b55ea6c3a2ed29776ca0dd549d523ae8d6d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:04 +0100 Subject: [PATCH 179/326] virtio-mem: memory block states are specific to Sub Block Mode (SBM) let's use a new "sbm" sub-struct to hold SBM-specific state and rename + move applicable definitions, functions, and variables (related to memory block states). While at it: - Drop the "_STATE" part from memory block states - Rename "nb_mb_state" to "mb_count" - "set_mb_state" / "get_mb_state" vs. "mb_set_state" / "mb_get_state" - Don't use lengthy "enum virtio_mem_smb_mb_state", simply use "uint8_t" Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-19-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 215 ++++++++++++++++++------------------ 1 file changed, 109 insertions(+), 106 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 4234bfc0cf52..c6cc301c78e1 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -42,20 +42,23 @@ MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); * onlined to the same zone - virtio-mem relies on this behavior. */ -enum virtio_mem_mb_state { +/* + * State of a Linux memory block in SBM. + */ +enum virtio_mem_sbm_mb_state { /* Unplugged, not added to Linux. Can be reused later. */ - VIRTIO_MEM_MB_STATE_UNUSED = 0, + VIRTIO_MEM_SBM_MB_UNUSED = 0, /* (Partially) plugged, not added to Linux. Error on add_memory(). */ - VIRTIO_MEM_MB_STATE_PLUGGED, + VIRTIO_MEM_SBM_MB_PLUGGED, /* Fully plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE, + VIRTIO_MEM_SBM_MB_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, /* Fully plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE, + VIRTIO_MEM_SBM_MB_ONLINE, /* Partially plugged, fully added to Linux, online. */ - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, - VIRTIO_MEM_MB_STATE_COUNT + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_COUNT }; struct virtio_mem { @@ -113,9 +116,6 @@ struct virtio_mem { */ const char *resource_name; - /* Summary of all memory block states. */ - unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT]; - /* * We don't want to add too much memory if it's not getting onlined, * to avoid running OOM. Besides this threshold, we allow to have at @@ -125,27 +125,29 @@ struct virtio_mem { atomic64_t offline_size; uint64_t offline_threshold; - /* - * One byte state per memory block. - * - * Allocated via vmalloc(). When preparing new blocks, resized - * (alloc+copy+free) when needed (crossing pages with the next mb). - * (when crossing pages). - * - * With 128MB memory blocks, we have states for 512GB of memory in one - * page. - */ - uint8_t *mb_state; + struct { + /* Summary of all memory block states. */ + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; + + /* + * One byte state per memory block. Allocated via vmalloc(). + * Resized (alloc+copy+free) on demand. + * + * With 128 MiB memory blocks, we have states for 512 GiB of + * memory in one 4 KiB page. + */ + uint8_t *mb_states; + } sbm; /* - * $nb_sb_per_mb bit per memory block. Handled similar to mb_state. + * $nb_sb_per_mb bit per memory block. Handled similar to sbm.mb_states. * * With 4MB subblocks, we manage 128GB of memory in one page. */ unsigned long *sb_bitmap; /* - * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap. + * Mutex that protects the sbm.mb_count, sbm.mb_states, and sb_bitmap. * * When this lock is held the pointers can't change, ONLINE and * OFFLINE blocks can't change the state and no subblocks will get @@ -254,68 +256,68 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, /* * Set the state of a memory block, taking care of the state counter. */ -static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, - enum virtio_mem_mb_state state) +static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, + unsigned long mb_id, uint8_t state) { const unsigned long idx = mb_id - vm->first_mb_id; - enum virtio_mem_mb_state old_state; + uint8_t old_state; - old_state = vm->mb_state[idx]; - vm->mb_state[idx] = state; + old_state = vm->sbm.mb_states[idx]; + vm->sbm.mb_states[idx] = state; - BUG_ON(vm->nb_mb_state[old_state] == 0); - vm->nb_mb_state[old_state]--; - vm->nb_mb_state[state]++; + BUG_ON(vm->sbm.mb_count[old_state] == 0); + vm->sbm.mb_count[old_state]--; + vm->sbm.mb_count[state]++; } /* * Get the state of a memory block. */ -static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, - unsigned long mb_id) +static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, + unsigned long mb_id) { const unsigned long idx = mb_id - vm->first_mb_id; - return vm->mb_state[idx]; + return vm->sbm.mb_states[idx]; } /* * Prepare the state array for the next memory block. */ -static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) { int old_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id); int new_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id + 1); - uint8_t *new_mb_state; + uint8_t *new_array; - if (vm->mb_state && old_pages == new_pages) + if (vm->sbm.mb_states && old_pages == new_pages) return 0; - new_mb_state = vzalloc(new_pages * PAGE_SIZE); - if (!new_mb_state) + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (vm->mb_state) - memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); - vfree(vm->mb_state); - vm->mb_state = new_mb_state; + if (vm->sbm.mb_states) + memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE); + vfree(vm->sbm.mb_states); + vm->sbm.mb_states = new_array; mutex_unlock(&vm->hotplug_mutex); return 0; } -#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ +#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ for (_mb_id = _vm->first_mb_id; \ - _mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ + _mb_id < _vm->next_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id++) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) -#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \ +#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ for (_mb_id = _vm->next_mb_id - 1; \ - _mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \ + _mb_id >= _vm->first_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id--) \ - if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) + if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) /* * Calculate the bit number in the subblock bitmap for the given subblock @@ -581,9 +583,9 @@ static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, static int virtio_mem_notify_going_online(struct virtio_mem *vm, unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - case VIRTIO_MEM_MB_STATE_OFFLINE: + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: return NOTIFY_OK; default: break; @@ -596,14 +598,14 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm, static void virtio_mem_notify_offline(struct virtio_mem *vm, unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_ONLINE: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + case VIRTIO_MEM_SBM_MB_ONLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); break; default: BUG(); @@ -613,13 +615,14 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) { - switch (virtio_mem_mb_get_state(vm, mb_id)) { - case VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL: - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); break; - case VIRTIO_MEM_MB_STATE_OFFLINE: - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_ONLINE); + case VIRTIO_MEM_SBM_MB_OFFLINE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE); break; default: BUG(); @@ -1188,7 +1191,7 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, return -ENOSPC; /* Resize the state array if required. */ - rc = virtio_mem_mb_state_prepare_next_mb(vm); + rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm); if (rc) return rc; @@ -1197,7 +1200,7 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, if (rc) return rc; - vm->nb_mb_state[VIRTIO_MEM_MB_STATE_UNUSED]++; + vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; *mb_id = vm->next_mb_id++; return 0; } @@ -1231,16 +1234,16 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * so the memory notifiers will find the block in the right state. */ if (count == vm->nb_sb_per_mb) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); /* Add the memory block to linux - if that fails, try to unplug. */ rc = virtio_mem_mb_add(vm, mb_id); if (rc) { - enum virtio_mem_mb_state new_state = VIRTIO_MEM_MB_STATE_UNUSED; + int new_state = VIRTIO_MEM_SBM_MB_UNUSED; dev_err(&vm->vdev->dev, "adding memory block %lu failed with %d\n", mb_id, rc); @@ -1250,8 +1253,8 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * where adding of memory failed - especially on -ENOMEM. */ if (virtio_mem_mb_unplug_sb(vm, mb_id, 0, count)) - new_state = VIRTIO_MEM_MB_STATE_PLUGGED; - virtio_mem_mb_set_state(vm, mb_id, new_state); + new_state = VIRTIO_MEM_SBM_MB_PLUGGED; + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); return rc; } @@ -1304,11 +1307,11 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { if (online) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE); else - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE); } return 0; @@ -1330,8 +1333,8 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) mutex_lock(&vm->hotplug_mutex); /* Try to plug subblocks of partially plugged online blocks. */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true); if (rc || !nb_sb) goto out_unlock; @@ -1339,8 +1342,8 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to plug subblocks of partially plugged offline blocks. */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false); if (rc || !nb_sb) goto out_unlock; @@ -1354,7 +1357,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) mutex_unlock(&vm->hotplug_mutex); /* Try to plug and add unused blocks */ - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED) { + virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) { if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; @@ -1403,8 +1406,8 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, /* some subblocks might have been unplugged even on failure */ if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); if (rc) return rc; @@ -1415,8 +1418,8 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, * unplugged. Temporarily drop the mutex, so * any pending GOING_ONLINE requests can be serviced/rejected. */ - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); mutex_unlock(&vm->hotplug_mutex); rc = virtio_mem_mb_remove(vm, mb_id); @@ -1454,8 +1457,8 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, return rc; } - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); return 0; } @@ -1515,8 +1518,8 @@ unplugged: rc = virtio_mem_mb_offline_and_remove(vm, mb_id); mutex_lock(&vm->hotplug_mutex); if (!rc) - virtio_mem_mb_set_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); } return 0; @@ -1542,8 +1545,8 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) mutex_lock(&vm->hotplug_mutex); /* Try to unplug subblocks of partially plugged offline blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) @@ -1552,8 +1555,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to unplug subblocks of plugged offline blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) @@ -1567,8 +1569,8 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to unplug subblocks of partially plugged online blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) @@ -1579,8 +1581,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) } /* Try to unplug subblocks of plugged online blocks. */ - virtio_mem_for_each_mb_state_rev(vm, mb_id, - VIRTIO_MEM_MB_STATE_ONLINE) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) @@ -1606,11 +1607,12 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) unsigned long mb_id; int rc; - virtio_mem_for_each_mb_state(vm, mb_id, VIRTIO_MEM_MB_STATE_PLUGGED) { + virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_PLUGGED) { rc = virtio_mem_mb_unplug(vm, mb_id); if (rc) return rc; - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); } return 0; @@ -2002,11 +2004,12 @@ static void virtio_mem_remove(struct virtio_device *vdev) * After we unregistered our callbacks, user space can online partially * plugged offline blocks. Make sure to remove them. */ - virtio_mem_for_each_mb_state(vm, mb_id, - VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL) { + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { rc = virtio_mem_mb_remove(vm, mb_id); BUG_ON(rc); - virtio_mem_mb_set_state(vm, mb_id, VIRTIO_MEM_MB_STATE_UNUSED); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); } /* * After we unregistered our callbacks, user space can no longer @@ -2031,7 +2034,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) } /* remove all tracking data - no locking needed */ - vfree(vm->mb_state); + vfree(vm->sbm.mb_states); vfree(vm->sb_bitmap); /* reset the device and cleanup the queues */ From 54c6a6ba75ba4c428b659b167f87c07100ba260e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:05 +0100 Subject: [PATCH 180/326] virito-mem: subblock states are specific to Sub Block Mode (SBM) Let's rename and move accordingly. While at it, rename sb_bitmap to "sb_states". Reviewed-by: Wei Yang Reviewed-by: Pankaj Gupta Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-20-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 132 +++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 63 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c6cc301c78e1..851cddf5c606 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -137,17 +137,23 @@ struct virtio_mem { * memory in one 4 KiB page. */ uint8_t *mb_states; + + /* + * Bitmap: one bit per subblock. Allocated similar to + * sbm.mb_states. + * + * A set bit means the corresponding subblock is plugged, + * otherwise it's unblocked. + * + * With 4 MiB subblocks, we manage 128 GiB of memory in one + * 4 KiB page. + */ + unsigned long *sb_states; } sbm; /* - * $nb_sb_per_mb bit per memory block. Handled similar to sbm.mb_states. - * - * With 4MB subblocks, we manage 128GB of memory in one page. - */ - unsigned long *sb_bitmap; - - /* - * Mutex that protects the sbm.mb_count, sbm.mb_states, and sb_bitmap. + * Mutex that protects the sbm.mb_count, sbm.mb_states, and + * sbm.sb_states. * * When this lock is held the pointers can't change, ONLINE and * OFFLINE blocks can't change the state and no subblocks will get @@ -323,8 +329,8 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) * Calculate the bit number in the subblock bitmap for the given subblock * inside the given memory block. */ -static int virtio_mem_sb_bitmap_bit_nr(struct virtio_mem *vm, - unsigned long mb_id, int sb_id) +static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, + unsigned long mb_id, int sb_id) { return (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; } @@ -334,13 +340,13 @@ static int virtio_mem_sb_bitmap_bit_nr(struct virtio_mem *vm, * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_set(vm->sb_bitmap, bit, count); + __bitmap_set(vm->sbm.sb_states, bit, count); } /* @@ -348,86 +354,87 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, * * Will not modify the state of the memory block. */ -static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); - __bitmap_clear(vm->sb_bitmap, bit, count); + __bitmap_clear(vm->sbm.sb_states, bit, count); } /* * Test if all selected subblocks are plugged. */ -static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); if (count == 1) - return test_bit(bit, vm->sb_bitmap); + return test_bit(bit, vm->sbm.sb_states); /* TODO: Helper similar to bitmap_set() */ - return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= + return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >= bit + count; } /* * Test if all selected subblocks are unplugged. */ -static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { - const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, sb_id); + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id); /* TODO: Helper similar to bitmap_set() */ - return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; + return find_next_bit(vm->sbm.sb_states, bit + count, bit) >= + bit + count; } /* * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is * none. */ -static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, +static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, unsigned long mb_id) { - const int bit = virtio_mem_sb_bitmap_bit_nr(vm, mb_id, 0); + const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); - return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - - bit; + return find_next_zero_bit(vm->sbm.sb_states, + bit + vm->nb_sb_per_mb, bit) - bit; } /* * Prepare the subblock bitmap for the next memory block. */ -static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) +static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) { const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); - unsigned long *new_sb_bitmap, *old_sb_bitmap; + unsigned long *new_bitmap, *old_bitmap; - if (vm->sb_bitmap && old_pages == new_pages) + if (vm->sbm.sb_states && old_pages == new_pages) return 0; - new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); - if (!new_sb_bitmap) + new_bitmap = vzalloc(new_pages * PAGE_SIZE); + if (!new_bitmap) return -ENOMEM; mutex_lock(&vm->hotplug_mutex); - if (new_sb_bitmap) - memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); + if (new_bitmap) + memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE); - old_sb_bitmap = vm->sb_bitmap; - vm->sb_bitmap = new_sb_bitmap; + old_bitmap = vm->sbm.sb_states; + vm->sbm.sb_states = new_bitmap; mutex_unlock(&vm->hotplug_mutex); - vfree(old_sb_bitmap); + vfree(old_bitmap); return 0; } @@ -638,7 +645,7 @@ static void virtio_mem_notify_going_offline(struct virtio_mem *vm, int sb_id; for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->subblock_size); @@ -654,7 +661,7 @@ static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, int sb_id; for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->subblock_size); @@ -944,7 +951,7 @@ static void virtio_mem_online_page_cb(struct page *page, unsigned int order) * If plugged, online the pages, otherwise, set them fake * offline (PageOffline). */ - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) generic_online_page(page, order); else virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, @@ -1102,7 +1109,7 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, rc = virtio_mem_send_plug_request(vm, addr, size); if (!rc) - virtio_mem_mb_set_sb_plugged(vm, mb_id, sb_id, count); + virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count); return rc; } @@ -1120,7 +1127,7 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, rc = virtio_mem_send_unplug_request(vm, addr, size); if (!rc) - virtio_mem_mb_set_sb_unplugged(vm, mb_id, sb_id, count); + virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count); return rc; } @@ -1143,14 +1150,14 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, while (*nb_sb) { /* Find the next candidate subblock */ while (sb_id >= 0 && - virtio_mem_mb_test_sb_unplugged(vm, mb_id, sb_id, 1)) + virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1)) sb_id--; if (sb_id < 0) break; /* Try to unplug multiple subblocks at a time */ count = 1; while (count < *nb_sb && sb_id > 0 && - virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { + virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) { count++; sb_id--; } @@ -1196,7 +1203,7 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, return rc; /* Resize the subblock bitmap if required. */ - rc = virtio_mem_sb_bitmap_prepare_next_mb(vm); + rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm); if (rc) return rc; @@ -1281,14 +1288,13 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, return -EINVAL; while (*nb_sb) { - sb_id = virtio_mem_mb_first_unplugged_sb(vm, mb_id); + sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); if (sb_id >= vm->nb_sb_per_mb) break; count = 1; while (count < *nb_sb && sb_id + count < vm->nb_sb_per_mb && - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id + count, - 1)) + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) count++; rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count); @@ -1305,7 +1311,7 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, virtio_mem_fake_online(pfn, nr_pages); } - if (virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { if (online) virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE); @@ -1405,13 +1411,13 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ - if (!virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) + if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); if (rc) return rc; - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { /* * Remove the block from Linux - this should never fail. * Hinder the block from getting onlined by marking it @@ -1480,7 +1486,7 @@ static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, /* If possible, try to unplug the complete block in one shot. */ if (*nb_sb >= vm->nb_sb_per_mb && - virtio_mem_mb_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0, vm->nb_sb_per_mb); if (!rc) { @@ -1494,7 +1500,7 @@ static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { /* Find the next candidate subblock */ while (sb_id >= 0 && - !virtio_mem_mb_test_sb_plugged(vm, mb_id, sb_id, 1)) + !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) sb_id--; if (sb_id < 0) break; @@ -1513,7 +1519,7 @@ unplugged: * remove it. This will usually not fail, as no memory is in use * anymore - however some other notifiers might NACK the request. */ - if (virtio_mem_mb_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { mutex_unlock(&vm->hotplug_mutex); rc = virtio_mem_mb_offline_and_remove(vm, mb_id); mutex_lock(&vm->hotplug_mutex); @@ -2035,7 +2041,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) /* remove all tracking data - no locking needed */ vfree(vm->sbm.mb_states); - vfree(vm->sb_bitmap); + vfree(vm->sbm.sb_states); /* reset the device and cleanup the queues */ vdev->config->reset(vdev); From 905c4c5146dcb1b1e0a534ae9b5da6c5e4f29c21 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:06 +0100 Subject: [PATCH 181/326] virtio-mem: nb_sb_per_mb and subblock_size are specific to Sub Block Mode (SBM) Let's rename to "sbs_per_mb" and "sb_size" and move accordingly. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-21-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 96 ++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 851cddf5c606..6395c3090252 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -96,11 +96,6 @@ struct virtio_mem { /* Maximum region size in bytes. */ uint64_t region_size; - /* The subblock size. */ - uint64_t subblock_size; - /* The number of subblocks per memory block. */ - uint32_t nb_sb_per_mb; - /* Id of the first memory block of this device. */ unsigned long first_mb_id; /* Id of the last usable memory block of this device. */ @@ -126,6 +121,11 @@ struct virtio_mem { uint64_t offline_threshold; struct { + /* The subblock size. */ + uint64_t sb_size; + /* The number of subblocks per Linux memory block. */ + uint32_t sbs_per_mb; + /* Summary of all memory block states. */ unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; @@ -256,7 +256,7 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id); - return (addr - mb_addr) / vm->subblock_size; + return (addr - mb_addr) / vm->sbm.sb_size; } /* @@ -332,7 +332,7 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, unsigned long mb_id, int sb_id) { - return (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; + return (mb_id - vm->first_mb_id) * vm->sbm.sbs_per_mb + sb_id; } /* @@ -395,7 +395,7 @@ static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm, } /* - * Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is + * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is * none. */ static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, @@ -404,7 +404,7 @@ static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0); return find_next_zero_bit(vm->sbm.sb_states, - bit + vm->nb_sb_per_mb, bit) - bit; + bit + vm->sbm.sbs_per_mb, bit) - bit; } /* @@ -413,8 +413,8 @@ static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) { const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; - const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; - const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; + const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; + const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); unsigned long *new_bitmap, *old_bitmap; @@ -640,15 +640,15 @@ static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) static void virtio_mem_notify_going_offline(struct virtio_mem *vm, unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); + sb_id * vm->sbm.sb_size); virtio_mem_fake_offline_going_offline(pfn, nr_pages); } } @@ -656,15 +656,15 @@ static void virtio_mem_notify_going_offline(struct virtio_mem *vm, static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, unsigned long mb_id) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size); + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; int sb_id; - for (sb_id = 0; sb_id < vm->nb_sb_per_mb; sb_id++) { + for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) { if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) continue; pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); + sb_id * vm->sbm.sb_size); virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); } } @@ -1103,8 +1103,8 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size; - const uint64_t size = count * vm->subblock_size; + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; int rc; rc = virtio_mem_send_plug_request(vm, addr, size); @@ -1121,8 +1121,8 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size; - const uint64_t size = count * vm->subblock_size; + sb_id * vm->sbm.sb_size; + const uint64_t size = count * vm->sbm.sb_size; int rc; rc = virtio_mem_send_unplug_request(vm, addr, size); @@ -1146,7 +1146,7 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, int sb_id, count; int rc; - sb_id = vm->nb_sb_per_mb - 1; + sb_id = vm->sbm.sbs_per_mb - 1; while (*nb_sb) { /* Find the next candidate subblock */ while (sb_id >= 0 && @@ -1181,7 +1181,7 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, */ static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id) { - uint64_t nb_sb = vm->nb_sb_per_mb; + uint64_t nb_sb = vm->sbm.sbs_per_mb; return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb); } @@ -1222,7 +1222,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, unsigned long mb_id, uint64_t *nb_sb) { - const int count = min_t(int, *nb_sb, vm->nb_sb_per_mb); + const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); int rc; if (WARN_ON_ONCE(!count)) @@ -1240,7 +1240,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * Mark the block properly offline before adding it to Linux, * so the memory notifiers will find the block in the right state. */ - if (count == vm->nb_sb_per_mb) + if (count == vm->sbm.sbs_per_mb) virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE); else @@ -1289,11 +1289,11 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, while (*nb_sb) { sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id); - if (sb_id >= vm->nb_sb_per_mb) + if (sb_id >= vm->sbm.sbs_per_mb) break; count = 1; while (count < *nb_sb && - sb_id + count < vm->nb_sb_per_mb && + sb_id + count < vm->sbm.sbs_per_mb && !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) count++; @@ -1306,12 +1306,12 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, /* fake-online the pages if the memory block is online */ pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); - nr_pages = PFN_DOWN(count * vm->subblock_size); + sb_id * vm->sbm.sb_size); + nr_pages = PFN_DOWN(count * vm->sbm.sb_size); virtio_mem_fake_online(pfn, nr_pages); } - if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { if (online) virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE); @@ -1328,7 +1328,7 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, */ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) { - uint64_t nb_sb = diff / vm->subblock_size; + uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; int rc; @@ -1411,13 +1411,13 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ - if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) + if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); if (rc) return rc; - if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { /* * Remove the block from Linux - this should never fail. * Hinder the block from getting onlined by marking it @@ -1444,12 +1444,12 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, unsigned long mb_id, int sb_id, int count) { - const unsigned long nr_pages = PFN_DOWN(vm->subblock_size) * count; + const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; unsigned long start_pfn; int rc; start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) + - sb_id * vm->subblock_size); + sb_id * vm->sbm.sb_size); rc = virtio_mem_fake_offline(start_pfn, nr_pages); if (rc) @@ -1485,19 +1485,19 @@ static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, int rc, sb_id; /* If possible, try to unplug the complete block in one shot. */ - if (*nb_sb >= vm->nb_sb_per_mb && - virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (*nb_sb >= vm->sbm.sbs_per_mb && + virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0, - vm->nb_sb_per_mb); + vm->sbm.sbs_per_mb); if (!rc) { - *nb_sb -= vm->nb_sb_per_mb; + *nb_sb -= vm->sbm.sbs_per_mb; goto unplugged; } else if (rc != -EBUSY) return rc; } /* Fallback to single subblocks. */ - for (sb_id = vm->nb_sb_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { + for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) { /* Find the next candidate subblock */ while (sb_id >= 0 && !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) @@ -1519,7 +1519,7 @@ unplugged: * remove it. This will usually not fail, as no memory is in use * anymore - however some other notifiers might NACK the request. */ - if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->nb_sb_per_mb)) { + if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { mutex_unlock(&vm->hotplug_mutex); rc = virtio_mem_mb_offline_and_remove(vm, mb_id); mutex_lock(&vm->hotplug_mutex); @@ -1536,7 +1536,7 @@ unplugged: */ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) { - uint64_t nb_sb = diff / vm->subblock_size; + uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; int rc; @@ -1827,11 +1827,11 @@ static int virtio_mem_init(struct virtio_mem *vm) * - Is required for now for alloc_contig_range() to work reliably - * it doesn't properly handle smaller granularity on ZONE_NORMAL. */ - vm->subblock_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) * PAGE_SIZE; - vm->subblock_size = max_t(uint64_t, vm->device_block_size, - vm->subblock_size); - vm->nb_sb_per_mb = memory_block_size_bytes() / vm->subblock_size; + vm->sbm.sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) * PAGE_SIZE; + vm->sbm.sb_size = max_t(uint64_t, vm->device_block_size, + vm->sbm.sb_size); + vm->sbm.sbs_per_mb = memory_block_size_bytes() / vm->sbm.sb_size; /* Round up to the next full memory block */ vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + @@ -1849,7 +1849,7 @@ static int virtio_mem_init(struct virtio_mem *vm) dev_info(&vm->vdev->dev, "memory block size: 0x%lx", memory_block_size_bytes()); dev_info(&vm->vdev->dev, "subblock size: 0x%llx", - (unsigned long long)vm->subblock_size); + (unsigned long long)vm->sbm.sb_size); if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) dev_info(&vm->vdev->dev, "nid: %d", vm->nid); From 8a6f082babea6744b876a23ff5ed6081bf12968d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:07 +0100 Subject: [PATCH 182/326] virtio-mem: memory block ids are specific to Sub Block Mode (SBM) Let's move first_mb_id/next_mb_id/last_usable_mb_id accordingly. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-22-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 46 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 6395c3090252..248d28e653a9 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -96,13 +96,6 @@ struct virtio_mem { /* Maximum region size in bytes. */ uint64_t region_size; - /* Id of the first memory block of this device. */ - unsigned long first_mb_id; - /* Id of the last usable memory block of this device. */ - unsigned long last_usable_mb_id; - /* Id of the next memory bock to prepare when needed. */ - unsigned long next_mb_id; - /* The parent resource for all memory added via this device. */ struct resource *parent_resource; /* @@ -121,6 +114,13 @@ struct virtio_mem { uint64_t offline_threshold; struct { + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; + /* The subblock size. */ uint64_t sb_size; /* The number of subblocks per Linux memory block. */ @@ -265,7 +265,7 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, unsigned long mb_id, uint8_t state) { - const unsigned long idx = mb_id - vm->first_mb_id; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; uint8_t old_state; old_state = vm->sbm.mb_states[idx]; @@ -282,7 +282,7 @@ static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm, static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, unsigned long mb_id) { - const unsigned long idx = mb_id - vm->first_mb_id; + const unsigned long idx = mb_id - vm->sbm.first_mb_id; return vm->sbm.mb_states[idx]; } @@ -292,8 +292,8 @@ static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm, */ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) { - int old_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id); - int new_pages = PFN_UP(vm->next_mb_id - vm->first_mb_id + 1); + int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id); + int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1); uint8_t *new_array; if (vm->sbm.mb_states && old_pages == new_pages) @@ -314,14 +314,14 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) } #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \ - for (_mb_id = _vm->first_mb_id; \ - _mb_id < _vm->next_mb_id && _vm->sbm.mb_count[_state]; \ + for (_mb_id = _vm->sbm.first_mb_id; \ + _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id++) \ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \ - for (_mb_id = _vm->next_mb_id - 1; \ - _mb_id >= _vm->first_mb_id && _vm->sbm.mb_count[_state]; \ + for (_mb_id = _vm->sbm.next_mb_id - 1; \ + _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \ _mb_id--) \ if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state) @@ -332,7 +332,7 @@ static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm, unsigned long mb_id, int sb_id) { - return (mb_id - vm->first_mb_id) * vm->sbm.sbs_per_mb + sb_id; + return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id; } /* @@ -412,7 +412,7 @@ static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm, */ static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm) { - const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; + const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id; const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb; const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb; int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); @@ -1194,7 +1194,7 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, { int rc; - if (vm->next_mb_id > vm->last_usable_mb_id) + if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id) return -ENOSPC; /* Resize the state array if required. */ @@ -1208,7 +1208,7 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, return rc; vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++; - *mb_id = vm->next_mb_id++; + *mb_id = vm->sbm.next_mb_id++; return 0; } @@ -1643,7 +1643,7 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) usable_region_size, &usable_region_size); end_addr = vm->addr + usable_region_size; end_addr = min(end_addr, phys_limit); - vm->last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; + vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; /* see if there is a request to change the size */ virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, @@ -1834,9 +1834,9 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->sbm.sbs_per_mb = memory_block_size_bytes() / vm->sbm.sb_size; /* Round up to the next full memory block */ - vm->first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + - memory_block_size_bytes()); - vm->next_mb_id = vm->first_mb_id; + vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + + memory_block_size_bytes()); + vm->sbm.next_mb_id = vm->sbm.first_mb_id; /* Prepare the offline threshold - make sure we can add two blocks. */ vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), From 602ef89457173a24dde30874faec1f15a00e112a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:08 +0100 Subject: [PATCH 183/326] virito-mem: existing (un)plug functions are specific to Sub Block Mode (SBM) Let's rename them accordingly. virtio_mem_plug_request() and virtio_mem_unplug_request() will be handled separately. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-23-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 90 ++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 248d28e653a9..ec81f9d4bccf 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1099,8 +1099,8 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm) * Plug selected subblocks. Updates the plugged state, but not the state * of the memory block. */ -static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, - int sb_id, int count) +static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->sbm.sb_size; @@ -1117,8 +1117,8 @@ static int virtio_mem_mb_plug_sb(struct virtio_mem *vm, unsigned long mb_id, * Unplug selected subblocks. Updates the plugged state, but not the state * of the memory block. */ -static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, - int sb_id, int count) +static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, + int sb_id, int count) { const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) + sb_id * vm->sbm.sb_size; @@ -1140,8 +1140,8 @@ static int virtio_mem_mb_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, * * Note: can fail after some subblocks were unplugged. */ -static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { int sb_id, count; int rc; @@ -1162,7 +1162,7 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, sb_id--; } - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); if (rc) return rc; *nb_sb -= count; @@ -1179,18 +1179,18 @@ static int virtio_mem_mb_unplug_any_sb(struct virtio_mem *vm, * * Note: can fail after some subblocks were unplugged. */ -static int virtio_mem_mb_unplug(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) { uint64_t nb_sb = vm->sbm.sbs_per_mb; - return virtio_mem_mb_unplug_any_sb(vm, mb_id, &nb_sb); + return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); } /* * Prepare tracking data for the next memory block. */ -static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, - unsigned long *mb_id) +static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm, + unsigned long *mb_id) { int rc; @@ -1218,9 +1218,8 @@ static int virtio_mem_prepare_next_mb(struct virtio_mem *vm, * * Will modify the state of the memory block. */ -static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb); int rc; @@ -1232,7 +1231,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * Plug the requested number of subblocks before adding it to linux, * so that onlining will directly online all plugged subblocks. */ - rc = virtio_mem_mb_plug_sb(vm, mb_id, 0, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count); if (rc) return rc; @@ -1259,7 +1258,7 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * TODO: Linux MM does not properly clean up yet in all cases * where adding of memory failed - especially on -ENOMEM. */ - if (virtio_mem_mb_unplug_sb(vm, mb_id, 0, count)) + if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) new_state = VIRTIO_MEM_SBM_MB_PLUGGED; virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); return rc; @@ -1277,8 +1276,9 @@ static int virtio_mem_mb_plug_and_add(struct virtio_mem *vm, * * Note: Can fail after some subblocks were successfully plugged. */ -static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, - uint64_t *nb_sb, bool online) +static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb, + bool online) { unsigned long pfn, nr_pages; int sb_id, count; @@ -1297,7 +1297,7 @@ static int virtio_mem_mb_plug_any_sb(struct virtio_mem *vm, unsigned long mb_id, !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1)) count++; - rc = virtio_mem_mb_plug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count); if (rc) return rc; *nb_sb -= count; @@ -1341,7 +1341,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to plug subblocks of partially plugged online blocks. */ virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, true); + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1350,7 +1350,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) /* Try to plug subblocks of partially plugged offline blocks. */ virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_plug_any_sb(vm, mb_id, &nb_sb, false); + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1367,7 +1367,7 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); if (rc || !nb_sb) return rc; cond_resched(); @@ -1378,10 +1378,10 @@ static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes())) return -ENOSPC; - rc = virtio_mem_prepare_next_mb(vm, &mb_id); + rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id); if (rc) return rc; - rc = virtio_mem_mb_plug_and_add(vm, mb_id, &nb_sb); + rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb); if (rc) return rc; cond_resched(); @@ -1402,13 +1402,13 @@ out_unlock: * * Note: Can fail after some subblocks were successfully unplugged. */ -static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) { int rc; - rc = virtio_mem_mb_unplug_any_sb(vm, mb_id, nb_sb); + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) @@ -1440,9 +1440,9 @@ static int virtio_mem_mb_unplug_any_sb_offline(struct virtio_mem *vm, * * Will modify the state of the memory block. */ -static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, - unsigned long mb_id, int sb_id, - int count) +static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, + unsigned long mb_id, int sb_id, + int count) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; unsigned long start_pfn; @@ -1456,7 +1456,7 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, return rc; /* Try to unplug the allocated memory */ - rc = virtio_mem_mb_unplug_sb(vm, mb_id, sb_id, count); + rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count); if (rc) { /* Return the memory to the buddy. */ virtio_mem_fake_online(start_pfn, nr_pages); @@ -1478,17 +1478,17 @@ static int virtio_mem_mb_unplug_sb_online(struct virtio_mem *vm, * Note: Can fail after some subblocks were successfully unplugged. Can * return 0 even if subblocks were busy and could not get unplugged. */ -static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, - unsigned long mb_id, - uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) { int rc, sb_id; /* If possible, try to unplug the complete block in one shot. */ if (*nb_sb >= vm->sbm.sbs_per_mb && virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, 0, - vm->sbm.sbs_per_mb); + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0, + vm->sbm.sbs_per_mb); if (!rc) { *nb_sb -= vm->sbm.sbs_per_mb; goto unplugged; @@ -1505,7 +1505,7 @@ static int virtio_mem_mb_unplug_any_sb_online(struct virtio_mem *vm, if (sb_id < 0) break; - rc = virtio_mem_mb_unplug_sb_online(vm, mb_id, sb_id, 1); + rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1); if (rc == -EBUSY) continue; else if (rc) @@ -1553,8 +1553,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) /* Try to unplug subblocks of partially plugged offline blocks. */ virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, - &nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1562,8 +1561,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) /* Try to unplug subblocks of plugged offline blocks. */ virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { - rc = virtio_mem_mb_unplug_any_sb_offline(vm, mb_id, - &nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; cond_resched(); @@ -1577,8 +1575,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) /* Try to unplug subblocks of partially plugged online blocks. */ virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, - &nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; mutex_unlock(&vm->hotplug_mutex); @@ -1588,8 +1585,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) /* Try to unplug subblocks of plugged online blocks. */ virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { - rc = virtio_mem_mb_unplug_any_sb_online(vm, mb_id, - &nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); if (rc || !nb_sb) goto out_unlock; mutex_unlock(&vm->hotplug_mutex); @@ -1614,7 +1610,7 @@ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) int rc; virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_PLUGGED) { - rc = virtio_mem_mb_unplug(vm, mb_id); + rc = virtio_mem_sbm_unplug_mb(vm, mb_id); if (rc) return rc; virtio_mem_sbm_set_mb_state(vm, mb_id, From d46dfb62f676f949352c7fd8b7a0fa3b7fe1b933 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:09 +0100 Subject: [PATCH 184/326] virtio-mem: memory notifier callbacks are specific to Sub Block Mode (SBM) Let's rename accordingly. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-24-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index ec81f9d4bccf..cdcf67e55a56 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -587,8 +587,8 @@ static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start, return start >= vm->addr && start + size <= vm->addr + vm->region_size; } -static int virtio_mem_notify_going_online(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm, + unsigned long mb_id) { switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: @@ -602,8 +602,8 @@ static int virtio_mem_notify_going_online(struct virtio_mem *vm, return NOTIFY_BAD; } -static void virtio_mem_notify_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, + unsigned long mb_id) { switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: @@ -620,7 +620,8 @@ static void virtio_mem_notify_offline(struct virtio_mem *vm, } } -static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) +static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, + unsigned long mb_id) { switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: @@ -637,8 +638,8 @@ static void virtio_mem_notify_online(struct virtio_mem *vm, unsigned long mb_id) } } -static void virtio_mem_notify_going_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, + unsigned long mb_id) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; @@ -653,8 +654,8 @@ static void virtio_mem_notify_going_offline(struct virtio_mem *vm, } } -static void virtio_mem_notify_cancel_offline(struct virtio_mem *vm, - unsigned long mb_id) +static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long mb_id) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size); unsigned long pfn; @@ -714,7 +715,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - virtio_mem_notify_going_offline(vm, mb_id); + virtio_mem_sbm_notify_going_offline(vm, mb_id); break; case MEM_GOING_ONLINE: mutex_lock(&vm->hotplug_mutex); @@ -724,10 +725,10 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - rc = virtio_mem_notify_going_online(vm, mb_id); + rc = virtio_mem_sbm_notify_going_online(vm, mb_id); break; case MEM_OFFLINE: - virtio_mem_notify_offline(vm, mb_id); + virtio_mem_sbm_notify_offline(vm, mb_id); atomic64_add(size, &vm->offline_size); /* @@ -741,7 +742,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, mutex_unlock(&vm->hotplug_mutex); break; case MEM_ONLINE: - virtio_mem_notify_online(vm, mb_id); + virtio_mem_sbm_notify_online(vm, mb_id); atomic64_sub(size, &vm->offline_size); /* @@ -760,7 +761,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, case MEM_CANCEL_OFFLINE: if (!vm->hotplug_active) break; - virtio_mem_notify_cancel_offline(vm, mb_id); + virtio_mem_sbm_notify_cancel_offline(vm, mb_id); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; From 01afdee29aef144ad956d1d5302aaaeabf498f48 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:10 +0100 Subject: [PATCH 185/326] virtio-mem: factor out adding/removing memory from Linux Let's use wrappers for the low-level functions that dev_dbg/dev_warn and work on addr + size, such that we can reuse them for adding/removing in other granularity. We only warn when adding memory failed, because that's something to pay attention to. We won't warn when removing failed, we'll reuse that in racy context soon (and we do have proper BUG_ON() statements in the current cases where it must never happen). Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-25-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 107 ++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 34 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index cdcf67e55a56..95fa0262af1d 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -451,18 +451,16 @@ static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size) } /* - * Try to add a memory block to Linux. This will usually only fail - * if out of memory. + * Try adding memory to Linux. Will usually only fail if out of memory. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) { - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - const uint64_t size = memory_block_size_bytes(); int rc; /* @@ -476,32 +474,50 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) return -ENOMEM; } - dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); + dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); /* Memory might get onlined immediately. */ atomic64_add(size, &vm->offline_size); rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name, MEMHP_MERGE_RESOURCE); - if (rc) + if (rc) { atomic64_sub(size, &vm->offline_size); + dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc); + /* + * TODO: Linux MM does not properly clean up yet in all cases + * where adding of memory failed - especially on -ENOMEM. + */ + } return rc; } /* - * Try to remove a memory block from Linux. Will only fail if the memory block - * is not offline. + * See virtio_mem_add_memory(): Try adding a single Linux memory block. + */ +static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_add_memory(vm, addr, size); +} + +/* + * Try removing memory from Linux. Will only fail if memory blocks aren't + * offline. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) +static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr, + uint64_t size) { - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - const uint64_t size = memory_block_size_bytes(); int rc; - dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); + dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); rc = remove_memory(vm->nid, addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); @@ -510,27 +526,41 @@ static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) * immediately instead of waiting. */ virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc); } return rc; } /* - * Try to offline and remove a memory block from Linux. + * See virtio_mem_remove_memory(): Try removing a single Linux memory block. + */ +static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_remove_memory(vm, addr, size); +} + +/* + * Try offlining and removing memory from Linux. * * Must not be called with the vm->hotplug_mutex held (possible deadlock with * onlining code). * - * Will not modify the state of the memory block. + * Will not modify the state of memory blocks in virtio-mem. */ -static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, - unsigned long mb_id) +static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm, + uint64_t addr, + uint64_t size) { - const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); - const uint64_t size = memory_block_size_bytes(); int rc; - dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", - mb_id); + dev_dbg(&vm->vdev->dev, + "offlining and removing memory: 0x%llx - 0x%llx\n", addr, + addr + size - 1); + rc = offline_and_remove_memory(vm->nid, addr, size); if (!rc) { atomic64_sub(size, &vm->offline_size); @@ -539,10 +569,26 @@ static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, * immediately instead of waiting. */ virtio_mem_retry(vm); + } else { + dev_dbg(&vm->vdev->dev, + "offlining and removing memory failed: %d\n", rc); } return rc; } +/* + * See virtio_mem_offline_and_remove_memory(): Try offlining and removing + * a single Linux memory block. + */ +static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, + unsigned long mb_id) +{ + const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); + const uint64_t size = memory_block_size_bytes(); + + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} + /* * Trigger the workqueue so the device can perform its magic. */ @@ -1248,17 +1294,10 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); /* Add the memory block to linux - if that fails, try to unplug. */ - rc = virtio_mem_mb_add(vm, mb_id); + rc = virtio_mem_sbm_add_mb(vm, mb_id); if (rc) { int new_state = VIRTIO_MEM_SBM_MB_UNUSED; - dev_err(&vm->vdev->dev, - "adding memory block %lu failed with %d\n", mb_id, rc); - - /* - * TODO: Linux MM does not properly clean up yet in all cases - * where adding of memory failed - especially on -ENOMEM. - */ if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count)) new_state = VIRTIO_MEM_SBM_MB_PLUGGED; virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); @@ -1429,7 +1468,7 @@ static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, VIRTIO_MEM_SBM_MB_UNUSED); mutex_unlock(&vm->hotplug_mutex); - rc = virtio_mem_mb_remove(vm, mb_id); + rc = virtio_mem_sbm_remove_mb(vm, mb_id); BUG_ON(rc); mutex_lock(&vm->hotplug_mutex); } @@ -1522,7 +1561,7 @@ unplugged: */ if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { mutex_unlock(&vm->hotplug_mutex); - rc = virtio_mem_mb_offline_and_remove(vm, mb_id); + rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id); mutex_lock(&vm->hotplug_mutex); if (!rc) virtio_mem_sbm_set_mb_state(vm, mb_id, @@ -2009,7 +2048,7 @@ static void virtio_mem_remove(struct virtio_device *vdev) */ virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_mb_remove(vm, mb_id); + rc = virtio_mem_sbm_remove_mb(vm, mb_id); BUG_ON(rc); virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED); From 4ba50cd3355d742c8befbfe38dcbe559f2b0f758 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:11 +0100 Subject: [PATCH 186/326] virtio-mem: Big Block Mode (BBM) memory hotplug Currently, we do not support device block sizes that exceed the Linux memory block size. For example, having a device block size of 1 GiB (e.g., gigantic pages in the hypervisor) won't work with 128 MiB Linux memory blocks. Let's implement Big Block Mode (BBM), whereby we add/remove at least one Linux memory block at a time. With a 1 GiB device block size, a Big Block (BB) will cover 8 Linux memory blocks. We'll keep registering the online_page_callback machinery, it will be used for safe memory hotunplug in BBM next. Note: BBM is properly prepared for variable-sized Linux memory blocks that we might see in the future. So we won't care how many Linux memory blocks a big block actually spans, and how the memory notifier is called. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Michal Hocko Cc: Oscar Salvador Cc: Wei Yang Cc: Andrew Morton Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-26-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 552 ++++++++++++++++++++++++++++-------- 1 file changed, 437 insertions(+), 115 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 95fa0262af1d..8a4f735360ac 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -30,12 +30,18 @@ MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); /* * virtio-mem currently supports the following modes of operation: * - * * Sub Block Mode (SBM): A Linux memory block spans 1..X subblocks (SB). The + * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The * size of a Sub Block (SB) is determined based on the device block size, the * pageblock size, and the maximum allocation granularity of the buddy. * Subblocks within a Linux memory block might either be plugged or unplugged. * Memory is added/removed to Linux MM in Linux memory block granularity. * + * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks. + * Memory is added/removed to Linux MM in Big Block granularity. + * + * The mode is determined automatically based on the Linux memory block size + * and the device block size. + * * User space / core MM (auto onlining) is responsible for onlining added * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are * always onlined separately, and all memory within a Linux memory block is @@ -61,6 +67,19 @@ enum virtio_mem_sbm_mb_state { VIRTIO_MEM_SBM_MB_COUNT }; +/* + * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks. + */ +enum virtio_mem_bbm_bb_state { + /* Unplugged, not added to Linux. Can be reused later. */ + VIRTIO_MEM_BBM_BB_UNUSED = 0, + /* Plugged, not added to Linux. Error on add_memory(). */ + VIRTIO_MEM_BBM_BB_PLUGGED, + /* Plugged and added to Linux. */ + VIRTIO_MEM_BBM_BB_ADDED, + VIRTIO_MEM_BBM_BB_COUNT +}; + struct virtio_mem { struct virtio_device *vdev; @@ -113,47 +132,70 @@ struct virtio_mem { atomic64_t offline_size; uint64_t offline_threshold; - struct { - /* Id of the first memory block of this device. */ - unsigned long first_mb_id; - /* Id of the last usable memory block of this device. */ - unsigned long last_usable_mb_id; - /* Id of the next memory bock to prepare when needed. */ - unsigned long next_mb_id; + /* If set, the driver is in SBM, otherwise in BBM. */ + bool in_sbm; - /* The subblock size. */ - uint64_t sb_size; - /* The number of subblocks per Linux memory block. */ - uint32_t sbs_per_mb; + union { + struct { + /* Id of the first memory block of this device. */ + unsigned long first_mb_id; + /* Id of the last usable memory block of this device. */ + unsigned long last_usable_mb_id; + /* Id of the next memory bock to prepare when needed. */ + unsigned long next_mb_id; - /* Summary of all memory block states. */ - unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; + /* The subblock size. */ + uint64_t sb_size; + /* The number of subblocks per Linux memory block. */ + uint32_t sbs_per_mb; - /* - * One byte state per memory block. Allocated via vmalloc(). - * Resized (alloc+copy+free) on demand. - * - * With 128 MiB memory blocks, we have states for 512 GiB of - * memory in one 4 KiB page. - */ - uint8_t *mb_states; + /* Summary of all memory block states. */ + unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT]; - /* - * Bitmap: one bit per subblock. Allocated similar to - * sbm.mb_states. - * - * A set bit means the corresponding subblock is plugged, - * otherwise it's unblocked. - * - * With 4 MiB subblocks, we manage 128 GiB of memory in one - * 4 KiB page. - */ - unsigned long *sb_states; - } sbm; + /* + * One byte state per memory block. Allocated via + * vmalloc(). Resized (alloc+copy+free) on demand. + * + * With 128 MiB memory blocks, we have states for 512 + * GiB of memory in one 4 KiB page. + */ + uint8_t *mb_states; + + /* + * Bitmap: one bit per subblock. Allocated similar to + * sbm.mb_states. + * + * A set bit means the corresponding subblock is + * plugged, otherwise it's unblocked. + * + * With 4 MiB subblocks, we manage 128 GiB of memory + * in one 4 KiB page. + */ + unsigned long *sb_states; + } sbm; + + struct { + /* Id of the first big block of this device. */ + unsigned long first_bb_id; + /* Id of the last usable big block of this device. */ + unsigned long last_usable_bb_id; + /* Id of the next device bock to prepare when needed. */ + unsigned long next_bb_id; + + /* Summary of all big block states. */ + unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT]; + + /* One byte state per big block. See sbm.mb_states. */ + uint8_t *bb_states; + + /* The block size used for plugging/adding/removing. */ + uint64_t bb_size; + } bbm; + }; /* - * Mutex that protects the sbm.mb_count, sbm.mb_states, and - * sbm.sb_states. + * Mutex that protects the sbm.mb_count, sbm.mb_states, + * sbm.sb_states, bbm.bb_count, and bbm.bb_states * * When this lock is held the pointers can't change, ONLINE and * OFFLINE blocks can't change the state and no subblocks will get @@ -247,6 +289,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) return mb_id * memory_block_size_bytes(); } +/* + * Calculate the big block id of a given address. + */ +static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm, + uint64_t addr) +{ + return addr / vm->bbm.bb_size; +} + +/* + * Calculate the physical start address of a given big block id. + */ +static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm, + unsigned long bb_id) +{ + return bb_id * vm->bbm.bb_size; +} + /* * Calculate the subblock id of a given address. */ @@ -259,6 +319,67 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm, return (addr - mb_addr) / vm->sbm.sb_size; } +/* + * Set the state of a big block, taking care of the state counter. + */ +static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm, + unsigned long bb_id, + enum virtio_mem_bbm_bb_state state) +{ + const unsigned long idx = bb_id - vm->bbm.first_bb_id; + enum virtio_mem_bbm_bb_state old_state; + + old_state = vm->bbm.bb_states[idx]; + vm->bbm.bb_states[idx] = state; + + BUG_ON(vm->bbm.bb_count[old_state] == 0); + vm->bbm.bb_count[old_state]--; + vm->bbm.bb_count[state]++; +} + +/* + * Get the state of a big block. + */ +static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm, + unsigned long bb_id) +{ + return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id]; +} + +/* + * Prepare the big block state array for the next big block. + */ +static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) +{ + unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id; + unsigned long new_bytes = old_bytes + 1; + int old_pages = PFN_UP(old_bytes); + int new_pages = PFN_UP(new_bytes); + uint8_t *new_array; + + if (vm->bbm.bb_states && old_pages == new_pages) + return 0; + + new_array = vzalloc(new_pages * PAGE_SIZE); + if (!new_array) + return -ENOMEM; + + mutex_lock(&vm->hotplug_mutex); + if (vm->bbm.bb_states) + memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE); + vfree(vm->bbm.bb_states); + vm->bbm.bb_states = new_array; + mutex_unlock(&vm->hotplug_mutex); + + return 0; +} + +#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.first_bb_id; \ + _bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id++) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + /* * Set the state of a memory block, taking care of the state counter. */ @@ -502,6 +623,17 @@ static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id) return virtio_mem_add_memory(vm, addr, size); } +/* + * See virtio_mem_add_memory(): Try adding a big block. + */ +static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_add_memory(vm, addr, size); +} + /* * Try removing memory from Linux. Will only fail if memory blocks aren't * offline. @@ -729,20 +861,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, struct memory_notify *mhp = arg; const unsigned long start = PFN_PHYS(mhp->start_pfn); const unsigned long size = PFN_PHYS(mhp->nr_pages); - const unsigned long mb_id = virtio_mem_phys_to_mb_id(start); int rc = NOTIFY_OK; + unsigned long id; if (!virtio_mem_overlaps_range(vm, start, size)) return NOTIFY_DONE; - /* - * Memory is onlined/offlined in memory block granularity. We cannot - * cross virtio-mem device boundaries and memory block boundaries. Bail - * out if this ever changes. - */ - if (WARN_ON_ONCE(size != memory_block_size_bytes() || - !IS_ALIGNED(start, memory_block_size_bytes()))) - return NOTIFY_BAD; + if (vm->in_sbm) { + id = virtio_mem_phys_to_mb_id(start); + /* + * In SBM, we add memory in separate memory blocks - we expect + * it to be onlined/offlined in the same granularity. Bail out + * if this ever changes. + */ + if (WARN_ON_ONCE(size != memory_block_size_bytes() || + !IS_ALIGNED(start, memory_block_size_bytes()))) + return NOTIFY_BAD; + } else { + id = virtio_mem_phys_to_bb_id(vm, start); + /* + * In BBM, we only care about onlining/offlining happening + * within a single big block, we don't care about the + * actual granularity as we don't track individual Linux + * memory blocks. + */ + if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1))) + return NOTIFY_BAD; + } /* * Avoid circular locking lockdep warnings. We lock the mutex @@ -761,7 +906,8 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - virtio_mem_sbm_notify_going_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_going_offline(vm, id); break; case MEM_GOING_ONLINE: mutex_lock(&vm->hotplug_mutex); @@ -771,10 +917,12 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; } vm->hotplug_active = true; - rc = virtio_mem_sbm_notify_going_online(vm, mb_id); + if (vm->in_sbm) + rc = virtio_mem_sbm_notify_going_online(vm, id); break; case MEM_OFFLINE: - virtio_mem_sbm_notify_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_offline(vm, id); atomic64_add(size, &vm->offline_size); /* @@ -788,7 +936,8 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, mutex_unlock(&vm->hotplug_mutex); break; case MEM_ONLINE: - virtio_mem_sbm_notify_online(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_online(vm, id); atomic64_sub(size, &vm->offline_size); /* @@ -807,7 +956,8 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, case MEM_CANCEL_OFFLINE: if (!vm->hotplug_active) break; - virtio_mem_sbm_notify_cancel_offline(vm, mb_id); + if (vm->in_sbm) + virtio_mem_sbm_notify_cancel_offline(vm, id); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -978,27 +1128,29 @@ static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn, static void virtio_mem_online_page_cb(struct page *page, unsigned int order) { const unsigned long addr = page_to_phys(page); - const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr); + unsigned long id, sb_id; struct virtio_mem *vm; - int sb_id; + bool do_online; - /* - * We exploit here that subblocks have at least MAX_ORDER_NR_PAGES. - * size/alignment and that this callback is is called with such a - * size/alignment. So we cannot cross subblocks and therefore - * also not memory blocks. - */ rcu_read_lock(); list_for_each_entry_rcu(vm, &virtio_mem_devices, next) { if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order))) continue; - sb_id = virtio_mem_phys_to_sb_id(vm, addr); - /* - * If plugged, online the pages, otherwise, set them fake - * offline (PageOffline). - */ - if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1)) + if (vm->in_sbm) { + /* + * We exploit here that subblocks have at least + * MAX_ORDER_NR_PAGES size/alignment - so we cannot + * cross subblocks within one call. + */ + id = virtio_mem_phys_to_mb_id(addr); + sb_id = virtio_mem_phys_to_sb_id(vm, addr); + do_online = virtio_mem_sbm_test_sb_plugged(vm, id, + sb_id, 1); + } else { + do_online = true; + } + if (do_online) generic_online_page(page, order); else virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order, @@ -1178,6 +1330,32 @@ static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id, return rc; } +/* + * Request to unplug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_unplug_request(vm, addr, size); +} + +/* + * Request to plug a big block. + * + * Will not modify the state of the big block. + */ +static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_send_plug_request(vm, addr, size); +} + /* * Unplug the desired number of plugged subblocks of a offline or not-added * memory block. Will fail if any subblock cannot get unplugged (instead of @@ -1363,10 +1541,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, return 0; } -/* - * Try to plug the requested amount of memory. - */ -static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; @@ -1433,6 +1608,112 @@ out_unlock: return rc; } +/* + * Plug a big block and add it to Linux. + * + * Will modify the state of the big block. + */ +static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_UNUSED)) + return -EINVAL; + + rc = virtio_mem_bbm_plug_bb(vm, bb_id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + + rc = virtio_mem_bbm_add_bb(vm, bb_id); + if (rc) { + if (!virtio_mem_bbm_unplug_bb(vm, bb_id)) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + else + /* Retry from the main loop. */ + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + return rc; + } + return 0; +} + +/* + * Prepare tracking data for the next big block. + */ +static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm, + unsigned long *bb_id) +{ + int rc; + + if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id) + return -ENOSPC; + + /* Resize the big block state array if required. */ + rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm); + if (rc) + return rc; + + vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++; + *bb_id = vm->bbm.next_bb_id; + vm->bbm.next_bb_id++; + return 0; +} + +static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + unsigned long bb_id; + int rc; + + if (!nb_bb) + return 0; + + /* Try to plug and add unused big blocks */ + virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + cond_resched(); + } + + /* Try to prepare, plug and add new big blocks */ + while (nb_bb) { + if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size)) + return -ENOSPC; + + rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id); + if (rc) + return rc; + rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id); + if (!rc) + nb_bb--; + if (rc) + return rc; + cond_resched(); + } + + return 0; +} + +/* + * Try to plug the requested amount of memory. + */ +static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_plug_request(vm, diff); + return virtio_mem_bbm_plug_request(vm, diff); +} + /* * Unplug the desired number of plugged subblocks of an offline memory block. * Will fail if any subblock cannot get unplugged (instead of skipping it). @@ -1571,10 +1852,7 @@ unplugged: return 0; } -/* - * Try to unplug the requested amount of memory. - */ -static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) +static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; @@ -1640,20 +1918,42 @@ out_unlock: return rc; } +/* + * Try to unplug the requested amount of memory. + */ +static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + if (vm->in_sbm) + return virtio_mem_sbm_unplug_request(vm, diff); + return -EBUSY; +} + /* * Try to unplug all blocks that couldn't be unplugged before, for example, * because the hypervisor was busy. */ static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm) { - unsigned long mb_id; + unsigned long id; int rc; - virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_PLUGGED) { - rc = virtio_mem_sbm_unplug_mb(vm, mb_id); + if (!vm->in_sbm) { + virtio_mem_bbm_for_each_bb(vm, id, + VIRTIO_MEM_BBM_BB_PLUGGED) { + rc = virtio_mem_bbm_unplug_bb(vm, id); + if (rc) + return rc; + virtio_mem_bbm_set_bb_state(vm, id, + VIRTIO_MEM_BBM_BB_UNUSED); + } + return 0; + } + + virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) { + rc = virtio_mem_sbm_unplug_mb(vm, id); if (rc) return rc; - virtio_mem_sbm_set_mb_state(vm, mb_id, + virtio_mem_sbm_set_mb_state(vm, id, VIRTIO_MEM_SBM_MB_UNUSED); } @@ -1679,7 +1979,13 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm) usable_region_size, &usable_region_size); end_addr = vm->addr + usable_region_size; end_addr = min(end_addr, phys_limit); - vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr) - 1; + + if (vm->in_sbm) + vm->sbm.last_usable_mb_id = + virtio_mem_phys_to_mb_id(end_addr) - 1; + else + vm->bbm.last_usable_bb_id = + virtio_mem_phys_to_bb_id(vm, end_addr) - 1; /* see if there is a request to change the size */ virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size, @@ -1802,6 +2108,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm) static int virtio_mem_init(struct virtio_mem *vm) { const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS; + uint64_t sb_size, addr; uint16_t node_id; if (!vm->vdev->config->get) { @@ -1834,16 +2141,6 @@ static int virtio_mem_init(struct virtio_mem *vm) if (vm->nid == NUMA_NO_NODE) vm->nid = memory_add_physaddr_to_nid(vm->addr); - /* - * We always hotplug memory in memory block granularity. This way, - * we have to wait for exactly one memory block to online. - */ - if (vm->device_block_size > memory_block_size_bytes()) { - dev_err(&vm->vdev->dev, - "The block size is not supported (too big).\n"); - return -EINVAL; - } - /* bad device setup - warn only */ if (!IS_ALIGNED(vm->addr, memory_block_size_bytes())) dev_warn(&vm->vdev->dev, @@ -1863,20 +2160,35 @@ static int virtio_mem_init(struct virtio_mem *vm) * - Is required for now for alloc_contig_range() to work reliably - * it doesn't properly handle smaller granularity on ZONE_NORMAL. */ - vm->sbm.sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) * PAGE_SIZE; - vm->sbm.sb_size = max_t(uint64_t, vm->device_block_size, - vm->sbm.sb_size); - vm->sbm.sbs_per_mb = memory_block_size_bytes() / vm->sbm.sb_size; + sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, + pageblock_nr_pages) * PAGE_SIZE; + sb_size = max_t(uint64_t, vm->device_block_size, sb_size); - /* Round up to the next full memory block */ - vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(vm->addr - 1 + - memory_block_size_bytes()); - vm->sbm.next_mb_id = vm->sbm.first_mb_id; + if (sb_size < memory_block_size_bytes()) { + /* SBM: At least two subblocks per Linux memory block. */ + vm->in_sbm = true; + vm->sbm.sb_size = sb_size; + vm->sbm.sbs_per_mb = memory_block_size_bytes() / + vm->sbm.sb_size; + + /* Round up to the next full memory block */ + addr = vm->addr + memory_block_size_bytes() - 1; + vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr); + vm->sbm.next_mb_id = vm->sbm.first_mb_id; + } else { + /* BBM: At least one Linux memory block. */ + vm->bbm.bb_size = vm->device_block_size; + + vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, vm->addr); + vm->bbm.next_bb_id = vm->bbm.first_bb_id; + } /* Prepare the offline threshold - make sure we can add two blocks. */ vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + /* In BBM, we also want at least two big blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, + vm->offline_threshold); dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); @@ -1884,8 +2196,12 @@ static int virtio_mem_init(struct virtio_mem *vm) (unsigned long long)vm->device_block_size); dev_info(&vm->vdev->dev, "memory block size: 0x%lx", memory_block_size_bytes()); - dev_info(&vm->vdev->dev, "subblock size: 0x%llx", - (unsigned long long)vm->sbm.sb_size); + if (vm->in_sbm) + dev_info(&vm->vdev->dev, "subblock size: 0x%llx", + (unsigned long long)vm->sbm.sb_size); + else + dev_info(&vm->vdev->dev, "big block size: 0x%llx", + (unsigned long long)vm->bbm.bb_size); if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA)) dev_info(&vm->vdev->dev, "nid: %d", vm->nid); @@ -2042,22 +2358,24 @@ static void virtio_mem_remove(struct virtio_device *vdev) cancel_work_sync(&vm->wq); hrtimer_cancel(&vm->retry_timer); - /* - * After we unregistered our callbacks, user space can online partially - * plugged offline blocks. Make sure to remove them. - */ - virtio_mem_sbm_for_each_mb(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_sbm_remove_mb(vm, mb_id); - BUG_ON(rc); - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_UNUSED); + if (vm->in_sbm) { + /* + * After we unregistered our callbacks, user space can online + * partially plugged offline blocks. Make sure to remove them. + */ + virtio_mem_sbm_for_each_mb(vm, mb_id, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { + rc = virtio_mem_sbm_remove_mb(vm, mb_id); + BUG_ON(rc); + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_UNUSED); + } + /* + * After we unregistered our callbacks, user space can no longer + * offline partially plugged online memory blocks. No need to + * worry about them. + */ } - /* - * After we unregistered our callbacks, user space can no longer - * offline partially plugged online memory blocks. No need to worry - * about them. - */ /* unregister callbacks */ unregister_virtio_mem_device(vm); @@ -2076,8 +2394,12 @@ static void virtio_mem_remove(struct virtio_device *vdev) } /* remove all tracking data - no locking needed */ - vfree(vm->sbm.mb_states); - vfree(vm->sbm.sb_states); + if (vm->in_sbm) { + vfree(vm->sbm.mb_states); + vfree(vm->sbm.sb_states); + } else { + vfree(vm->bbm.bb_states); + } /* reset the device and cleanup the queues */ vdev->config->reset(vdev); From faa45ff4ce885af93a3233a408c5a74b2943226b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:12 +0100 Subject: [PATCH 187/326] virtio-mem: allow to force Big Block Mode (BBM) and set the big block size Let's allow to force BBM, even if subblocks would be possible. Take care of properly calculating the first big block id, because the start address might no longer be aligned to the big block size. Also, allow to manually configure the size of Big Blocks. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Michal Hocko Cc: Oscar Salvador Cc: Wei Yang Cc: Andrew Morton Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-27-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 8a4f735360ac..861149acafe5 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -27,6 +27,16 @@ static bool unplug_online = true; module_param(unplug_online, bool, 0644); MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); +static bool force_bbm; +module_param(force_bbm, bool, 0444); +MODULE_PARM_DESC(force_bbm, + "Force Big Block Mode. Default is 0 (auto-selection)"); + +static unsigned long bbm_block_size; +module_param(bbm_block_size, ulong, 0444); +MODULE_PARM_DESC(bbm_block_size, + "Big Block size in bytes. Default is 0 (auto-detection)."); + /* * virtio-mem currently supports the following modes of operation: * @@ -2164,7 +2174,7 @@ static int virtio_mem_init(struct virtio_mem *vm) pageblock_nr_pages) * PAGE_SIZE; sb_size = max_t(uint64_t, vm->device_block_size, sb_size); - if (sb_size < memory_block_size_bytes()) { + if (sb_size < memory_block_size_bytes() && !force_bbm) { /* SBM: At least two subblocks per Linux memory block. */ vm->in_sbm = true; vm->sbm.sb_size = sb_size; @@ -2177,9 +2187,24 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->sbm.next_mb_id = vm->sbm.first_mb_id; } else { /* BBM: At least one Linux memory block. */ - vm->bbm.bb_size = vm->device_block_size; + vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size, + memory_block_size_bytes()); - vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, vm->addr); + if (bbm_block_size) { + if (!is_power_of_2(bbm_block_size)) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is not a power of 2"); + } else if (bbm_block_size < vm->bbm.bb_size) { + dev_warn(&vm->vdev->dev, + "bbm_block_size is too small"); + } else { + vm->bbm.bb_size = bbm_block_size; + } + } + + /* Round up to the next aligned big block */ + addr = vm->addr + vm->bbm.bb_size - 1; + vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); vm->bbm.next_bb_id = vm->bbm.first_bb_id; } From 8dc4bb58a146655eb057247d7c9d19e73928715b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:13 +0100 Subject: [PATCH 188/326] mm/memory_hotplug: extend offline_and_remove_memory() to handle more than one memory block virtio-mem soon wants to use offline_and_remove_memory() memory that exceeds a single Linux memory block (memory_block_size_bytes()). Let's remove that restriction. Let's remember the old state and try to restore that if anything goes wrong. While re-onlining can, in general, fail, it's highly unlikely to happen (usually only when a notifier fails to allocate memory, and these are rather rare). This will be used by virtio-mem to offline+remove memory ranges that are bigger than a single memory block - for example, with a device block size of 1 GiB (e.g., gigantic pages in the hypervisor) and a Linux memory block size of 128MB. While we could compress the state into 2 bit, using 8 bit is much easier. This handling is similar, but different to acpi_scan_try_to_offline(): a) We don't try to offline twice. I am not sure if this CONFIG_MEMCG optimization is still relevant - it should only apply to ZONE_NORMAL (where we have no guarantees). If relevant, we can always add it. b) acpi_scan_try_to_offline() simply onlines all memory in case something goes wrong. It doesn't restore previous online type. Let's do that, so we won't overwrite what e.g., user space configured. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Michal Hocko Cc: Oscar Salvador Cc: Wei Yang Cc: Andrew Morton Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-28-david@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Andrew Morton --- mm/memory_hotplug.c | 109 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 18 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 63b2e46b6555..2b6cc42ba0a3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1788,39 +1788,112 @@ int remove_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); +static int try_offline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t online_type = MMOP_ONLINE_KERNEL; + uint8_t **online_types = arg; + struct page *page; + int rc; + + /* + * Sense the online_type via the zone of the memory block. Offlining + * with multiple zones within one memory block will be rejected + * by offlining code ... so we don't care about that. + */ + page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); + if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + online_type = MMOP_ONLINE_MOVABLE; + + rc = device_offline(&mem->dev); + /* + * Default is MMOP_OFFLINE - change it only if offlining succeeded, + * so try_reonline_memory_block() can do the right thing. + */ + if (!rc) + **online_types = online_type; + + (*online_types)++; + /* Ignore if already offline. */ + return rc < 0 ? rc : 0; +} + +static int try_reonline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t **online_types = arg; + int rc; + + if (**online_types != MMOP_OFFLINE) { + mem->online_type = **online_types; + rc = device_online(&mem->dev); + if (rc < 0) + pr_warn("%s: Failed to re-online memory: %d", + __func__, rc); + } + + /* Continue processing all remaining memory blocks. */ + (*online_types)++; + return 0; +} + /* - * Try to offline and remove a memory block. Might take a long time to - * finish in case memory is still in use. Primarily useful for memory devices - * that logically unplugged all memory (so it's no longer in use) and want to - * offline + remove the memory block. + * Try to offline and remove memory. Might take a long time to finish in case + * memory is still in use. Primarily useful for memory devices that logically + * unplugged all memory (so it's no longer in use) and want to offline + remove + * that memory. */ int offline_and_remove_memory(int nid, u64 start, u64 size) { - struct memory_block *mem; - int rc = -EINVAL; + const unsigned long mb_count = size / memory_block_size_bytes(); + uint8_t *online_types, *tmp; + int rc; if (!IS_ALIGNED(start, memory_block_size_bytes()) || - size != memory_block_size_bytes()) - return rc; - - lock_device_hotplug(); - mem = find_memory_block(__pfn_to_section(PFN_DOWN(start))); - if (mem) - rc = device_offline(&mem->dev); - /* Ignore if the device is already offline. */ - if (rc > 0) - rc = 0; + !IS_ALIGNED(size, memory_block_size_bytes()) || !size) + return -EINVAL; /* - * In case we succeeded to offline the memory block, remove it. + * We'll remember the old online type of each memory block, so we can + * try to revert whatever we did when offlining one memory block fails + * after offlining some others succeeded. + */ + online_types = kmalloc_array(mb_count, sizeof(*online_types), + GFP_KERNEL); + if (!online_types) + return -ENOMEM; + /* + * Initialize all states to MMOP_OFFLINE, so when we abort processing in + * try_offline_memory_block(), we'll skip all unprocessed blocks in + * try_reonline_memory_block(). + */ + memset(online_types, MMOP_OFFLINE, mb_count); + + lock_device_hotplug(); + + tmp = online_types; + rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); + + /* + * In case we succeeded to offline all memory, remove it. * This cannot fail as it cannot get onlined in the meantime. */ if (!rc) { rc = try_remove_memory(nid, start, size); - WARN_ON_ONCE(rc); + if (rc) + pr_err("%s: Failed to remove memory: %d", __func__, rc); + } + + /* + * Rollback what we did. While memory onlining might theoretically fail + * (nacked by a notifier), it barely ever happens. + */ + if (rc) { + tmp = online_types; + walk_memory_blocks(start, size, &tmp, + try_reonline_memory_block); } unlock_device_hotplug(); + kfree(online_types); return rc; } EXPORT_SYMBOL_GPL(offline_and_remove_memory); From 269ac9389db4854f7b05c4749ff051763e7578d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:14 +0100 Subject: [PATCH 189/326] virtio-mem: Big Block Mode (BBM) - basic memory hotunplug Let's try to unplug completely offline big blocks first. Then, (if enabled via unplug_offline) try to offline and remove whole big blocks. No locking necessary - we can deal with concurrent onlining/offlining just fine. Note1: This is sub-optimal and might be dangerous in some environments: we could end up in an infinite loop when offlining (e.g., long-term pinnings), similar as with DIMMs. We'll introduce safe memory hotunplug via fake-offlining next, and use this basic mode only when explicitly enabled. Note2: Without ZONE_MOVABLE, memory unplug will be extremely unreliable with bigger block sizes. Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Michal Hocko Cc: Oscar Salvador Cc: Wei Yang Cc: Andrew Morton Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-29-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 156 +++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 861149acafe5..f1696cdb7b0c 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -390,6 +390,12 @@ static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm) _bb_id++) \ if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) +#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \ + for (_bb_id = vm->bbm.next_bb_id - 1; \ + _bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \ + _bb_id--) \ + if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state) + /* * Set the state of a memory block, taking care of the state counter. */ @@ -685,6 +691,18 @@ static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) return virtio_mem_remove_memory(vm, addr, size); } +/* + * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered + * by the big block. + */ +static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_remove_memory(vm, addr, size); +} + /* * Try offlining and removing memory from Linux. * @@ -731,6 +749,19 @@ static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm, return virtio_mem_offline_and_remove_memory(vm, addr, size); } +/* + * See virtio_mem_offline_and_remove_memory(): Try to offline and remove a + * all Linux memory blocks covered by the big block. + */ +static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); + const uint64_t size = vm->bbm.bb_size; + + return virtio_mem_offline_and_remove_memory(vm, addr, size); +} + /* * Trigger the workqueue so the device can perform its magic. */ @@ -1928,6 +1959,129 @@ out_unlock: return rc; } +/* + * Try to offline and remove a big block from Linux and unplug it. Will fail + * with -EBUSY if some memory is busy and cannot get unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + */ +static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_ADDED)) + return -EINVAL; + + rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); + if (rc) + return rc; + + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); + if (rc) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + else + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + return rc; +} + +/* + * Try to remove a big block from Linux and unplug it. Will fail with + * -EBUSY if some memory is online. + * + * Will modify the state of the memory block. + */ +static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, + unsigned long bb_id) +{ + int rc; + + if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_ADDED)) + return -EINVAL; + + rc = virtio_mem_bbm_remove_bb(vm, bb_id); + if (rc) + return -EBUSY; + + rc = virtio_mem_bbm_unplug_bb(vm, bb_id); + if (rc) + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_PLUGGED); + else + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_UNUSED); + return rc; +} + +/* + * Test if a big block is completely offline. + */ +static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long pfn; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + if (pfn_to_online_page(pfn)) + return false; + } + + return true; +} + +static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) +{ + uint64_t nb_bb = diff / vm->bbm.bb_size; + uint64_t bb_id; + int rc; + + if (!nb_bb) + return 0; + + /* Try to unplug completely offline big blocks first. */ + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); + /* + * As we're holding no locks, this check is racy as memory + * can get onlined in the meantime - but we'll fail gracefully. + */ + if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) + continue; + rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + + if (!unplug_online) + return 0; + + /* Try to unplug any big blocks. */ + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + + return nb_bb ? -EBUSY : 0; +} + /* * Try to unplug the requested amount of memory. */ @@ -1935,7 +2089,7 @@ static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff) { if (vm->in_sbm) return virtio_mem_sbm_unplug_request(vm, diff); - return -EBUSY; + return virtio_mem_bbm_unplug_request(vm, diff); } /* From 3711387a7543f2716e52ce5a5d92e3d580423a40 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 Nov 2020 14:38:15 +0100 Subject: [PATCH 190/326] virtio-mem: Big Block Mode (BBM) - safe memory hotunplug Let's add a safe mechanism to unplug memory, avoiding long/endless loops when trying to offline memory - similar to in SBM. Fake-offline all memory (via alloc_contig_range()) before trying to offline+remove it. Use this mode as default, but allow to enable the other mode explicitly (which could give better memory hotunplug guarantees in some environments). The "unsafe" mode can be enabled e.g., via virtio_mem.bbm_safe_unplug=0 on the cmdline. Reviewed-by: Wei Yang Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Pankaj Gupta Cc: Michal Hocko Cc: Oscar Salvador Cc: Wei Yang Cc: Andrew Morton Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20201112133815.13332-30-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 97 ++++++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index f1696cdb7b0c..9fc9ec4a25f5 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -37,6 +37,11 @@ module_param(bbm_block_size, ulong, 0444); MODULE_PARM_DESC(bbm_block_size, "Big Block size in bytes. Default is 0 (auto-detection)."); +static bool bbm_safe_unplug = true; +module_param(bbm_safe_unplug, bool, 0444); +MODULE_PARM_DESC(bbm_safe_unplug, + "Use a safe unplug mechanism in BBM, avoiding long/endless loops"); + /* * virtio-mem currently supports the following modes of operation: * @@ -87,6 +92,8 @@ enum virtio_mem_bbm_bb_state { VIRTIO_MEM_BBM_BB_PLUGGED, /* Plugged and added to Linux. */ VIRTIO_MEM_BBM_BB_ADDED, + /* All online parts are fake-offline, ready to remove. */ + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE, VIRTIO_MEM_BBM_BB_COUNT }; @@ -889,6 +896,32 @@ static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm, } } +static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + /* + * When marked as "fake-offline", all online memory of this device block + * is allocated by us. Otherwise, we don't have any memory allocated. + */ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_going_offline(pfn, nr_pages); +} + +static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm, + unsigned long bb_id, + unsigned long pfn, + unsigned long nr_pages) +{ + if (virtio_mem_bbm_get_bb_state(vm, bb_id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE) + return; + virtio_mem_fake_offline_cancel_offline(pfn, nr_pages); +} + /* * This callback will either be called synchronously from add_memory() or * asynchronously (e.g., triggered via user space). We have to be careful @@ -949,6 +982,10 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, vm->hotplug_active = true; if (vm->in_sbm) virtio_mem_sbm_notify_going_offline(vm, id); + else + virtio_mem_bbm_notify_going_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); break; case MEM_GOING_ONLINE: mutex_lock(&vm->hotplug_mutex); @@ -999,6 +1036,10 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; if (vm->in_sbm) virtio_mem_sbm_notify_cancel_offline(vm, id); + else + virtio_mem_bbm_notify_cancel_offline(vm, id, + mhp->start_pfn, + mhp->nr_pages); vm->hotplug_active = false; mutex_unlock(&vm->hotplug_mutex); break; @@ -1189,7 +1230,13 @@ static void virtio_mem_online_page_cb(struct page *page, unsigned int order) do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1); } else { - do_online = true; + /* + * If the whole block is marked fake offline, keep + * everything that way. + */ + id = virtio_mem_phys_to_bb_id(vm, addr); + do_online = virtio_mem_bbm_get_bb_state(vm, id) != + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE; } if (do_online) generic_online_page(page, order); @@ -1969,15 +2016,50 @@ out_unlock: static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, unsigned long bb_id) { + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + struct page *page; int rc; if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != VIRTIO_MEM_BBM_BB_ADDED)) return -EINVAL; + if (bbm_safe_unplug) { + /* + * Start by fake-offlining all memory. Once we marked the device + * block as fake-offline, all newly onlined memory will + * automatically be kept fake-offline. Protect from concurrent + * onlining/offlining until we have a consistent state. + */ + mutex_lock(&vm->hotplug_mutex); + virtio_mem_bbm_set_bb_state(vm, bb_id, + VIRTIO_MEM_BBM_BB_FAKE_OFFLINE); + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + + rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION); + if (rc) { + end_pfn = pfn; + goto rollback_safe_unplug; + } + } + mutex_unlock(&vm->hotplug_mutex); + } + rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id); - if (rc) + if (rc) { + if (bbm_safe_unplug) { + mutex_lock(&vm->hotplug_mutex); + goto rollback_safe_unplug; + } return rc; + } rc = virtio_mem_bbm_unplug_bb(vm, bb_id); if (rc) @@ -1987,6 +2069,17 @@ static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm, virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED); return rc; + +rollback_safe_unplug: + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + virtio_mem_fake_online(pfn, PAGES_PER_SECTION); + } + virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED); + mutex_unlock(&vm->hotplug_mutex); + return rc; } /* From 0ab4b8901a8edda4fd1c2aded36192566d89353f Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 11 Nov 2020 09:14:48 +0800 Subject: [PATCH 191/326] vhost_vdpa: switch to vmemdup_user() Replace opencoded alloc and copy with vmemdup_user() Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1605057288-60400-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vhost/vdpa.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 29ed4173f04e..ef688c8c0e0e 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -245,14 +245,10 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, return -EFAULT; if (vhost_vdpa_config_validate(v, &config)) return -EINVAL; - buf = kvzalloc(config.len, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, c->buf, config.len)) { - kvfree(buf); - return -EFAULT; - } + buf = vmemdup_user(c->buf, config.len); + if (IS_ERR(buf)) + return PTR_ERR(buf); ops->set_config(vdpa, config.off, buf, config.len); From 4d10367fd411437d55850357e471d9d5f9f47e72 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 29 Nov 2020 13:54:34 +0100 Subject: [PATCH 192/326] vdpa: ifcvf: Use dma_set_mask_and_coherent to simplify code 'pci_set_dma_mask()' + 'pci_set_consistent_dma_mask()' can be replaced by an equivalent 'dma_set_mask_and_coherent()' which is much less verbose. While at it, fix a typo (s/confiugration/configuration) Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201129125434.1462638-1-christophe.jaillet@wanadoo.fr Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_main.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 8b4028556cb6..fa1af301cf55 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (ret) { - IFCVF_ERR(pdev, "No usable DMA confiugration\n"); - return ret; - } - - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (ret) { - IFCVF_ERR(pdev, - "No usable coherent DMA confiugration\n"); + IFCVF_ERR(pdev, "No usable DMA configuration\n"); return ret; } From 29b90f92ee64f4cae2d8ef83922286567da6c2c1 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:39 +0100 Subject: [PATCH 193/326] vdpa: remove unnecessary 'default n' in Kconfig entries 'default n' is not necessary since it is already the default when nothing is specified. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-2-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 6caf539091e5..2c892e890b9e 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -14,7 +14,6 @@ config VDPA_SIM select DMA_OPS select VHOST_RING select GENERIC_NET_UTILS - default n help vDPA networking device simulator which loop TX traffic back to RX. This device is used for testing, prototyping and @@ -23,7 +22,6 @@ config VDPA_SIM config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI - default n help This kernel module can drive Intel IFC VF NIC to offload virtio dataplane traffic to hardware. @@ -42,7 +40,6 @@ config MLX5_VDPA_NET tristate "vDPA driver for ConnectX devices" select MLX5_VDPA depends on MLX5_CORE - default n help VDPA network driver for ConnectX6 and newer. Provides offloading of virtio net datapath such that descriptors put on the ring will From cc3d42386d14176e392d61da1de05c1d87c18b93 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:40 +0100 Subject: [PATCH 194/326] vdpa_sim: remove unnecessary headers inclusion Some headers are not necessary, so let's remove them to do some cleaning. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-3-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..63b85541fb5e 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -11,16 +11,9 @@ #include #include #include -#include -#include #include #include -#include -#include -#include #include -#include -#include #include #include #include From 423248d60d2b655321fc49eca1545f95a1bc9d6c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 15 Dec 2020 15:42:41 +0100 Subject: [PATCH 195/326] vdpa_sim: remove hard-coded virtq count Add a new attribute that will define the number of virt queues to be created for the vdpasim device. Signed-off-by: Max Gurtovoy [sgarzare: replace kmalloc_array() with kcalloc()] Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-4-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 63b85541fb5e..07ccc8609784 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -63,7 +63,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; - struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; + struct vdpasim_virtqueue *vqs; struct work_struct work; /* spinlock to synchronize virtqueue state */ spinlock_t lock; @@ -73,6 +73,7 @@ struct vdpasim { u32 status; u32 generation; u64 features; + int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; @@ -137,7 +138,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < VDPASIM_VQ_NUM; i++) + for (i = 0; i < vdpasim->nvqs; i++) vdpasim_vq_reset(&vdpasim->vqs[i]); spin_lock(&vdpasim->iommu_lock); @@ -343,7 +344,7 @@ static struct vdpasim *vdpasim_create(void) const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; struct device *dev; - int ret = -ENOMEM; + int i, ret = -ENOMEM; if (batch_mapping) ops = &vdpasim_net_batch_config_ops; @@ -354,6 +355,7 @@ static struct vdpasim *vdpasim_create(void) if (!vdpasim) goto err_alloc; + vdpasim->nvqs = VDPASIM_VQ_NUM; INIT_WORK(&vdpasim->work, vdpasim_work); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -364,6 +366,11 @@ static struct vdpasim *vdpasim_create(void) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); + vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue), + GFP_KERNEL); + if (!vdpasim->vqs) + goto err_iommu; + vdpasim->iommu = vhost_iotlb_alloc(2048, 0); if (!vdpasim->iommu) goto err_iommu; @@ -382,8 +389,8 @@ static struct vdpasim *vdpasim_create(void) eth_random_addr(vdpasim->config.mac); } - vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu); - vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu); + for (i = 0; i < vdpasim->nvqs; i++) + vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; ret = vdpa_register_device(&vdpasim->vdpa); @@ -652,6 +659,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) kfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); + kfree(vdpasim->vqs); } static const struct vdpa_config_ops vdpasim_net_config_ops = { From 2fc0ebfa039025d88009e8f275ea8bcd177a9cd9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:42 +0100 Subject: [PATCH 196/326] vdpa_sim: make IOTLB entries limit configurable Some devices may require a higher limit for the number of IOTLB entries, so let's make it configurable through a module parameter. By default, it's initialized with the current limit (2048). Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-5-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 07ccc8609784..d716bfaadb3b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -31,6 +31,11 @@ static int batch_mapping = 1; module_param(batch_mapping, int, 0444); MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable"); +static int max_iotlb_entries = 2048; +module_param(max_iotlb_entries, int, 0444); +MODULE_PARM_DESC(max_iotlb_entries, + "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)"); + static char *macaddr; module_param(macaddr, charp, 0); MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); @@ -371,7 +376,7 @@ static struct vdpasim *vdpasim_create(void) if (!vdpasim->vqs) goto err_iommu; - vdpasim->iommu = vhost_iotlb_alloc(2048, 0); + vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0); if (!vdpasim->iommu) goto err_iommu; From 36a9c30630256629e62a9186793c28735ade3ffc Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:43 +0100 Subject: [PATCH 197/326] vdpa_sim: rename vdpasim_config_ops variables These variables store generic callbacks used by the vDPA simulator core, so we can remove the 'net' word in their names. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-6-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index d716bfaadb3b..923f29076b1b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -341,8 +341,8 @@ static const struct dma_map_ops vdpasim_dma_ops = { .free = vdpasim_free_coherent, }; -static const struct vdpa_config_ops vdpasim_net_config_ops; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops; +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; static struct vdpasim *vdpasim_create(void) { @@ -352,9 +352,9 @@ static struct vdpasim *vdpasim_create(void) int i, ret = -ENOMEM; if (batch_mapping) - ops = &vdpasim_net_batch_config_ops; + ops = &vdpasim_batch_config_ops; else - ops = &vdpasim_net_config_ops; + ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); if (!vdpasim) @@ -667,7 +667,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) kfree(vdpasim->vqs); } -static const struct vdpa_config_ops vdpasim_net_config_ops = { +static const struct vdpa_config_ops vdpasim_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq = vdpasim_kick_vq, @@ -694,7 +694,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = { .free = vdpasim_free, }; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { +static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq = vdpasim_kick_vq, From 6c6e28fe45794054410ad8cd2770af69fbe0338d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:44 +0100 Subject: [PATCH 198/326] vdpa_sim: add struct vdpasim_dev_attr for device attributes vdpasim_dev_attr will contain device specific attributes. We starting moving the number of virtqueues (i.e. nvqs) to vdpasim_dev_attr. vdpasim_create() creates a new vDPA simulator following the device attributes defined in the vdpasim_dev_attr parameter. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-7-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 923f29076b1b..ae295dc57d7d 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -65,11 +65,16 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_NET_F_MAC); +struct vdpasim_dev_attr { + int nvqs; +}; + /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; struct work_struct work; + struct vdpasim_dev_attr dev_attr; /* spinlock to synchronize virtqueue state */ spinlock_t lock; struct virtio_net_config config; @@ -78,7 +83,6 @@ struct vdpasim { u32 status; u32 generation; u64 features; - int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; @@ -143,7 +147,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) vdpasim_vq_reset(&vdpasim->vqs[i]); spin_lock(&vdpasim->iommu_lock); @@ -344,7 +348,7 @@ static const struct dma_map_ops vdpasim_dma_ops = { static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; -static struct vdpasim *vdpasim_create(void) +static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) { const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; @@ -356,11 +360,12 @@ static struct vdpasim *vdpasim_create(void) else ops = &vdpasim_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, + dev_attr->nvqs); if (!vdpasim) goto err_alloc; - vdpasim->nvqs = VDPASIM_VQ_NUM; + vdpasim->dev_attr = *dev_attr; INIT_WORK(&vdpasim->work, vdpasim_work); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -371,7 +376,7 @@ static struct vdpasim *vdpasim_create(void) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); - vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue), + vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue), GFP_KERNEL); if (!vdpasim->vqs) goto err_iommu; @@ -394,7 +399,7 @@ static struct vdpasim *vdpasim_create(void) eth_random_addr(vdpasim->config.mac); } - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; @@ -722,7 +727,11 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { static int __init vdpasim_dev_init(void) { - vdpasim_dev = vdpasim_create(); + struct vdpasim_dev_attr dev_attr = {}; + + dev_attr.nvqs = VDPASIM_VQ_NUM; + + vdpasim_dev = vdpasim_create(&dev_attr); if (!IS_ERR(vdpasim_dev)) return 0; From 2f8f461888052f1b92ebe6419514355538f7cd68 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:45 +0100 Subject: [PATCH 199/326] vdpa_sim: add device id field in vdpasim_dev_attr Remove VDPASIM_DEVICE_ID macro and add 'id' field in vdpasim_dev_attr, that will be returned by vdpasim_get_device_id(). Use VIRTIO_ID_NET for vDPA-net simulator device id. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-8-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index ae295dc57d7d..3a8e57ac7762 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -55,7 +55,6 @@ struct vdpasim_virtqueue { #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 -#define VDPASIM_DEVICE_ID 0x1 #define VDPASIM_VENDOR_ID 0 #define VDPASIM_VQ_NUM 0x2 #define VDPASIM_NAME "vdpasim-netdev" @@ -67,6 +66,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | struct vdpasim_dev_attr { int nvqs; + u32 id; }; /* State of each vdpasim device */ @@ -546,7 +546,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa) static u32 vdpasim_get_device_id(struct vdpa_device *vdpa) { - return VDPASIM_DEVICE_ID; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.id; } static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa) @@ -729,6 +731,7 @@ static int __init vdpasim_dev_init(void) { struct vdpasim_dev_attr dev_attr = {}; + dev_attr.id = VIRTIO_ID_NET; dev_attr.nvqs = VDPASIM_VQ_NUM; vdpasim_dev = vdpasim_create(&dev_attr); From 011c35bac5ef25f701d9a79bc731782889c0ff58 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:46 +0100 Subject: [PATCH 200/326] vdpa_sim: add supported_features field in vdpasim_dev_attr Introduce a new VDPASIM_FEATURES macro with the generic features supported by the vDPA simulator, and VDPASIM_NET_FEATURES macro with vDPA-net features. Add 'supported_features' field in vdpasim_dev_attr, to allow devices to specify their features. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-9-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 3a8e57ac7762..6cf3c78b0e33 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -59,12 +59,15 @@ struct vdpasim_virtqueue { #define VDPASIM_VQ_NUM 0x2 #define VDPASIM_NAME "vdpasim-netdev" -static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | - (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_NET_F_MAC); +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_NET_F_MAC)) struct vdpasim_dev_attr { + u64 supported_features; int nvqs; u32 id; }; @@ -122,7 +125,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; - vringh_init_iotlb(&vq->vring, vdpasim_features, + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, VDPASIM_QUEUE_MAX, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) @@ -131,7 +134,8 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) (uintptr_t)vq->device_addr); } -static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) +static void vdpasim_vq_reset(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) { vq->ready = false; vq->desc_addr = 0; @@ -139,8 +143,8 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) vq->device_addr = 0; vq->cb = NULL; vq->private = NULL; - vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX, - false, NULL, NULL, NULL); + vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, + VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); } static void vdpasim_reset(struct vdpasim *vdpasim) @@ -148,7 +152,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) int i; for (i = 0; i < vdpasim->dev_attr.nvqs; i++) - vdpasim_vq_reset(&vdpasim->vqs[i]); + vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]); spin_lock(&vdpasim->iommu_lock); vhost_iotlb_reset(vdpasim->iommu); @@ -508,7 +512,9 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) static u64 vdpasim_get_features(struct vdpa_device *vdpa) { - return vdpasim_features; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.supported_features; } static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) @@ -520,7 +526,7 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) return -EINVAL; - vdpasim->features = features & vdpasim_features; + vdpasim->features = features & vdpasim->dev_attr.supported_features; /* We generally only know whether guest is using the legacy interface * here, so generally that's the earliest we can set config fields. @@ -732,6 +738,7 @@ static int __init vdpasim_dev_init(void) struct vdpasim_dev_attr dev_attr = {}; dev_attr.id = VIRTIO_ID_NET; + dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; vdpasim_dev = vdpasim_create(&dev_attr); From a13b5918fdd0dd7987aa5f3c202f68ed6ad468bb Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:47 +0100 Subject: [PATCH 201/326] vdpa_sim: add work_fn in vdpasim_dev_attr Rename vdpasim_work() in vdpasim_net_work() and add it to the vdpasim_dev_attr structure. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-10-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6cf3c78b0e33..d356929f9dd3 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -70,6 +70,8 @@ struct vdpasim_dev_attr { u64 supported_features; int nvqs; u32 id; + + work_func_t work_fn; }; /* State of each vdpasim device */ @@ -163,7 +165,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) ++vdpasim->generation; } -static void vdpasim_work(struct work_struct *work) +static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -370,7 +372,7 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_alloc; vdpasim->dev_attr = *dev_attr; - INIT_WORK(&vdpasim->work, vdpasim_work); + INIT_WORK(&vdpasim->work, dev_attr->work_fn); spin_lock_init(&vdpasim->lock); spin_lock_init(&vdpasim->iommu_lock); @@ -740,6 +742,7 @@ static int __init vdpasim_dev_init(void) dev_attr.id = VIRTIO_ID_NET; dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; + dev_attr.work_fn = vdpasim_net_work; vdpasim_dev = vdpasim_create(&dev_attr); From cf1a3b35382c10ce315c32bd2b3d7789897fbe13 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:48 +0100 Subject: [PATCH 202/326] vdpa_sim: store parsed MAC address in a buffer As preparation for the next patches, we store the MAC address, parsed during the vdpasim_create(), in a buffer that will be used to fill 'config' together with other configurations. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-11-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index d356929f9dd3..ccb40501cbd7 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -40,6 +40,8 @@ static char *macaddr; module_param(macaddr, charp, 0); MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); +u8 macaddr_buf[ETH_ALEN]; + struct vdpasim_virtqueue { struct vringh vring; struct vringh_kiov iov; @@ -396,13 +398,13 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_iommu; if (macaddr) { - mac_pton(macaddr, vdpasim->config.mac); - if (!is_valid_ether_addr(vdpasim->config.mac)) { + mac_pton(macaddr, macaddr_buf); + if (!is_valid_ether_addr(macaddr_buf)) { ret = -EADDRNOTAVAIL; goto err_iommu; } } else { - eth_random_addr(vdpasim->config.mac); + eth_random_addr(macaddr_buf); } for (i = 0; i < dev_attr->nvqs; i++) @@ -538,6 +540,8 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) config->mtu = cpu_to_vdpasim16(vdpasim, 1500); config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); + memcpy(config->mac, macaddr_buf, ETH_ALEN); + return 0; } From f37cbbc65178e0a45823d281d290c4c02da9631c Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:49 +0100 Subject: [PATCH 203/326] vdpa_sim: make 'config' generic and usable for any device type Add new 'config_size' attribute in 'vdpasim_dev_attr' and allocates 'config' dynamically to support any device types. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-12-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index ccb40501cbd7..4a0a6cadb9ff 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -70,6 +70,7 @@ struct vdpasim_virtqueue { struct vdpasim_dev_attr { u64 supported_features; + size_t config_size; int nvqs; u32 id; @@ -84,7 +85,8 @@ struct vdpasim { struct vdpasim_dev_attr dev_attr; /* spinlock to synchronize virtqueue state */ spinlock_t lock; - struct virtio_net_config config; + /* virtio config according to device type */ + void *config; struct vhost_iotlb *iommu; void *buffer; u32 status; @@ -384,6 +386,10 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_iommu; set_dma_ops(dev, &vdpasim_dma_ops); + vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL); + if (!vdpasim->config) + goto err_iommu; + vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue), GFP_KERNEL); if (!vdpasim->vqs) @@ -524,7 +530,8 @@ static u64 vdpasim_get_features(struct vdpa_device *vdpa) static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - struct virtio_net_config *config = &vdpasim->config; + struct virtio_net_config *config = + (struct virtio_net_config *)vdpasim->config; /* DMA mapping must be done by driver */ if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) @@ -596,8 +603,8 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len < sizeof(struct virtio_net_config)) - memcpy(buf, (u8 *)&vdpasim->config + offset, len); + if (offset + len < vdpasim->dev_attr.config_size) + memcpy(buf, vdpasim->config + offset, len); } static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, @@ -684,6 +691,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); kfree(vdpasim->vqs); + kfree(vdpasim->config); } static const struct vdpa_config_ops vdpasim_config_ops = { @@ -746,6 +754,7 @@ static int __init vdpasim_dev_init(void) dev_attr.id = VIRTIO_ID_NET; dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; + dev_attr.config_size = sizeof(struct virtio_net_config); dev_attr.work_fn = vdpasim_net_work; vdpasim_dev = vdpasim_create(&dev_attr); From 65b709586e222fa6ffd4166ac7fdb5d5dad113ee Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:50 +0100 Subject: [PATCH 204/326] vdpa_sim: add get_config callback in vdpasim_dev_attr The get_config callback can be used by the device to fill the config structure. The callback will be invoked in vdpasim_get_config() before copying bytes into caller buffer. Move vDPA-net config updates from vdpasim_set_features() in the new vdpasim_net_get_config() callback. This is safe since in vdpa_get_config() we already check that .set_features() callback is called before .get_config(). Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-13-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 35 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 4a0a6cadb9ff..5eadcd19ab6f 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -68,6 +68,8 @@ struct vdpasim_virtqueue { #define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ (1ULL << VIRTIO_NET_F_MAC)) +struct vdpasim; + struct vdpasim_dev_attr { u64 supported_features; size_t config_size; @@ -75,6 +77,7 @@ struct vdpasim_dev_attr { u32 id; work_func_t work_fn; + void (*get_config)(struct vdpasim *vdpasim, void *config); }; /* State of each vdpasim device */ @@ -530,8 +533,6 @@ static u64 vdpasim_get_features(struct vdpa_device *vdpa) static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - struct virtio_net_config *config = - (struct virtio_net_config *)vdpasim->config; /* DMA mapping must be done by driver */ if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) @@ -539,16 +540,6 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) vdpasim->features = features & vdpasim->dev_attr.supported_features; - /* We generally only know whether guest is using the legacy interface - * here, so generally that's the earliest we can set config fields. - * Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which - * implies VIRTIO_F_VERSION_1, but let's not try to be clever here. - */ - - config->mtu = cpu_to_vdpasim16(vdpasim, 1500); - config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); - memcpy(config->mac, macaddr_buf, ETH_ALEN); - return 0; } @@ -603,8 +594,13 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - if (offset + len < vdpasim->dev_attr.config_size) - memcpy(buf, vdpasim->config + offset, len); + if (offset + len > vdpasim->dev_attr.config_size) + return; + + if (vdpasim->dev_attr.get_config) + vdpasim->dev_attr.get_config(vdpasim, vdpasim->config); + + memcpy(buf, vdpasim->config + offset, len); } static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, @@ -747,6 +743,16 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .free = vdpasim_free, }; +static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_net_config *net_config = + (struct virtio_net_config *)config; + + net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); + net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); + memcpy(net_config->mac, macaddr_buf, ETH_ALEN); +} + static int __init vdpasim_dev_init(void) { struct vdpasim_dev_attr dev_attr = {}; @@ -755,6 +761,7 @@ static int __init vdpasim_dev_init(void) dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; dev_attr.config_size = sizeof(struct virtio_net_config); + dev_attr.get_config = vdpasim_net_get_config; dev_attr.work_fn = vdpasim_net_work; vdpasim_dev = vdpasim_create(&dev_attr); From c124a95e304bc5d37144e2fff6e52bb904d41810 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:51 +0100 Subject: [PATCH 205/326] vdpa_sim: add set_config callback in vdpasim_dev_attr The set_config callback can be used by the device to parse the config structure modified by the driver. The callback will be invoked, if set, in vdpasim_set_config() after copying bytes from caller buffer into vdpasim->config buffer. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-14-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 5eadcd19ab6f..e219aa852ef8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -78,6 +78,7 @@ struct vdpasim_dev_attr { work_func_t work_fn; void (*get_config)(struct vdpasim *vdpasim, void *config); + void (*set_config)(struct vdpasim *vdpasim, const void *config); }; /* State of each vdpasim device */ @@ -606,7 +607,15 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset, const void *buf, unsigned int len) { - /* No writable config supportted by vdpasim */ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + if (offset + len > vdpasim->dev_attr.config_size) + return; + + memcpy(vdpasim->config + offset, buf, len); + + if (vdpasim->dev_attr.set_config) + vdpasim->dev_attr.set_config(vdpasim, vdpasim->config); } static u32 vdpasim_get_generation(struct vdpa_device *vdpa) From b240491b7a48028fb67e5377ffd1be21e9260c4e Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:52 +0100 Subject: [PATCH 206/326] vdpa_sim: set vringh notify callback Instead of calling the vq callback directly, we can leverage the vringh_notify() function, adding vdpasim_vq_notify() and setting it in the vringh notify callback. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-15-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index e219aa852ef8..19ff5e352782 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -131,6 +131,17 @@ static struct vdpasim *dev_to_sim(struct device *dev) return vdpa_to_sim(vdpa); } +static void vdpasim_vq_notify(struct vringh *vring) +{ + struct vdpasim_virtqueue *vq = + container_of(vring, struct vdpasim_virtqueue, vring); + + if (!vq->cb) + return; + + vq->cb(vq->private); +} + static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; @@ -142,6 +153,8 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) (uintptr_t)vq->driver_addr, (struct vring_used *) (uintptr_t)vq->device_addr); + + vq->vring.notify = vdpasim_vq_notify; } static void vdpasim_vq_reset(struct vdpasim *vdpasim, @@ -155,6 +168,8 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->private = NULL; vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features, VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); + + vq->vring.notify = NULL; } static void vdpasim_reset(struct vdpasim *vdpasim) @@ -231,10 +246,10 @@ static void vdpasim_net_work(struct work_struct *work) smp_wmb(); local_bh_disable(); - if (txq->cb) - txq->cb(txq->private); - if (rxq->cb) - rxq->cb(rxq->private); + if (vringh_need_notify_iotlb(&txq->vring) > 0) + vringh_notify(&txq->vring); + if (vringh_need_notify_iotlb(&rxq->vring) > 0) + vringh_notify(&rxq->vring); local_bh_enable(); if (++pkts > 4) { From 165be1f80b8807687f7426d3f36f1031d633e979 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:53 +0100 Subject: [PATCH 207/326] vdpa_sim: use kvmalloc to allocate vdpasim->buffer The next patch will make the buffer size configurable from each device. Since the buffer could be larger than a page, we use kvmalloc() instead of kmalloc(). Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-16-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 19ff5e352782..87529899033a 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -418,7 +418,7 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) if (!vdpasim->iommu) goto err_iommu; - vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; @@ -707,7 +707,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); cancel_work_sync(&vdpasim->work); - kfree(vdpasim->buffer); + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); kfree(vdpasim->vqs); From da7af6967c6e9815f8da60a8db1d0fe35b8e97b9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:54 +0100 Subject: [PATCH 208/326] vdpa_sim: make vdpasim->buffer size configurable Allow each device to specify the size of the buffer allocated in vdpa_sim. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-17-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 87529899033a..60e45db29b15 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -73,6 +73,7 @@ struct vdpasim; struct vdpasim_dev_attr { u64 supported_features; size_t config_size; + size_t buffer_size; int nvqs; u32 id; @@ -418,7 +419,7 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) if (!vdpasim->iommu) goto err_iommu; - vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); + vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; @@ -787,6 +788,7 @@ static int __init vdpasim_dev_init(void) dev_attr.config_size = sizeof(struct virtio_net_config); dev_attr.get_config = vdpasim_net_get_config; dev_attr.work_fn = vdpasim_net_work; + dev_attr.buffer_size = PAGE_SIZE; vdpasim_dev = vdpasim_create(&dev_attr); From 275900dfa17c32f0f52b460e1fbd769cf694ecd3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 15 Dec 2020 15:42:55 +0100 Subject: [PATCH 209/326] vdpa_sim: split vdpasim_virtqueue's iov field in out_iov and in_iov vringh_getdesc_iotlb() manages 2 iovs for writable and readable descriptors. This is very useful for the block device, where for each request we have both types of descriptor. Let's split the vdpasim_virtqueue's iov field in out_iov and in_iov to use them with vringh_getdesc_iotlb(). We are using VIRTIO terminology for "out" (readable by the device) and "in" (writable by the device) descriptors. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-18-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 60e45db29b15..875e42390a13 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -44,7 +44,8 @@ u8 macaddr_buf[ETH_ALEN]; struct vdpasim_virtqueue { struct vringh vring; - struct vringh_kiov iov; + struct vringh_kiov in_iov; + struct vringh_kiov out_iov; unsigned short head; bool ready; u64 desc_addr; @@ -210,12 +211,12 @@ static void vdpasim_net_work(struct work_struct *work) while (true) { total_write = 0; - err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL, + err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, &txq->head, GFP_ATOMIC); if (err <= 0) break; - err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov, + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, &rxq->head, GFP_ATOMIC); if (err <= 0) { vringh_complete_iotlb(&txq->vring, txq->head, 0); @@ -223,13 +224,13 @@ static void vdpasim_net_work(struct work_struct *work) } while (true) { - read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov, + read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, vdpasim->buffer, PAGE_SIZE); if (read <= 0) break; - write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov, + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, vdpasim->buffer, read); if (write <= 0) break; From db1e8bb6c63a77b74b0c6b49662fc50d49d5f90b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 15 Dec 2020 15:42:56 +0100 Subject: [PATCH 210/326] vdpa: split vdpasim to core and net modules Introduce new vdpa_sim_net and vdpa_sim (core) drivers. This is a preparation for adding a vdpa simulator module for block devices. Signed-off-by: Max Gurtovoy [sgarzare: various cleanups/fixes] Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201215144256.155342-19-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/Kconfig | 13 +- drivers/vdpa/vdpa_sim/Makefile | 1 + drivers/vdpa/vdpa_sim/vdpa_sim.c | 221 +-------------------------- drivers/vdpa/vdpa_sim/vdpa_sim.h | 105 +++++++++++++ drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 177 +++++++++++++++++++++ 5 files changed, 298 insertions(+), 219 deletions(-) create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim.h create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_net.c diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 2c892e890b9e..92a6396f8a73 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -9,15 +9,20 @@ menuconfig VDPA if VDPA config VDPA_SIM - tristate "vDPA device simulator" + tristate "vDPA device simulator core" depends on RUNTIME_TESTING_MENU && HAS_DMA select DMA_OPS select VHOST_RING + help + Enable this module to support vDPA device simulators. These devices + are used for testing, prototyping and development of vDPA. + +config VDPA_SIM_NET + tristate "vDPA simulator for networking device" + depends on VDPA_SIM select GENERIC_NET_UTILS help - vDPA networking device simulator which loop TX traffic back - to RX. This device is used for testing, prototyping and - development of vDPA. + vDPA networking device simulator which loops TX traffic back to RX. config IFCVF tristate "Intel IFC VF vDPA driver" diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile index b40278f65e04..79d4536d347e 100644 --- a/drivers/vdpa/vdpa_sim/Makefile +++ b/drivers/vdpa/vdpa_sim/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o +obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 875e42390a13..b3fcc67bfdf0 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * VDPA networking device simulator. + * VDPA device simulator core. * * Copyright (c) 2020, Red Hat Inc. All rights reserved. * Author: Jason Wang @@ -14,17 +14,15 @@ #include #include #include -#include #include #include -#include #include -#include -#include + +#include "vdpa_sim.h" #define DRV_VERSION "0.1" #define DRV_AUTHOR "Jason Wang " -#define DRV_DESC "vDPA Device Simulator" +#define DRV_DESC "vDPA Device Simulator core" #define DRV_LICENSE "GPL v2" static int batch_mapping = 1; @@ -36,90 +34,9 @@ module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)"); -static char *macaddr; -module_param(macaddr, charp, 0); -MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); - -u8 macaddr_buf[ETH_ALEN]; - -struct vdpasim_virtqueue { - struct vringh vring; - struct vringh_kiov in_iov; - struct vringh_kiov out_iov; - unsigned short head; - bool ready; - u64 desc_addr; - u64 device_addr; - u64 driver_addr; - u32 num; - void *private; - irqreturn_t (*cb)(void *data); -}; - #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 #define VDPASIM_VENDOR_ID 0 -#define VDPASIM_VQ_NUM 0x2 -#define VDPASIM_NAME "vdpasim-netdev" - -#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ - (1ULL << VIRTIO_F_VERSION_1) | \ - (1ULL << VIRTIO_F_ACCESS_PLATFORM)) - -#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ - (1ULL << VIRTIO_NET_F_MAC)) - -struct vdpasim; - -struct vdpasim_dev_attr { - u64 supported_features; - size_t config_size; - size_t buffer_size; - int nvqs; - u32 id; - - work_func_t work_fn; - void (*get_config)(struct vdpasim *vdpasim, void *config); - void (*set_config)(struct vdpasim *vdpasim, const void *config); -}; - -/* State of each vdpasim device */ -struct vdpasim { - struct vdpa_device vdpa; - struct vdpasim_virtqueue *vqs; - struct work_struct work; - struct vdpasim_dev_attr dev_attr; - /* spinlock to synchronize virtqueue state */ - spinlock_t lock; - /* virtio config according to device type */ - void *config; - struct vhost_iotlb *iommu; - void *buffer; - u32 status; - u32 generation; - u64 features; - /* spinlock to synchronize iommu table */ - spinlock_t iommu_lock; -}; - -/* TODO: cross-endian support */ -static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) -{ - return virtio_legacy_is_little_endian() || - (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); -} - -static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) -{ - return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); -} - -static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) -{ - return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); -} - -static struct vdpasim *vdpasim_dev; static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) { @@ -190,80 +107,6 @@ static void vdpasim_reset(struct vdpasim *vdpasim) ++vdpasim->generation; } -static void vdpasim_net_work(struct work_struct *work) -{ - struct vdpasim *vdpasim = container_of(work, struct - vdpasim, work); - struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; - struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; - ssize_t read, write; - size_t total_write; - int pkts = 0; - int err; - - spin_lock(&vdpasim->lock); - - if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) - goto out; - - if (!txq->ready || !rxq->ready) - goto out; - - while (true) { - total_write = 0; - err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, - &txq->head, GFP_ATOMIC); - if (err <= 0) - break; - - err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, - &rxq->head, GFP_ATOMIC); - if (err <= 0) { - vringh_complete_iotlb(&txq->vring, txq->head, 0); - break; - } - - while (true) { - read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, - vdpasim->buffer, - PAGE_SIZE); - if (read <= 0) - break; - - write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, - vdpasim->buffer, read); - if (write <= 0) - break; - - total_write += write; - } - - /* Make sure data is wrote before advancing index */ - smp_wmb(); - - vringh_complete_iotlb(&txq->vring, txq->head, 0); - vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); - - /* Make sure used is visible before rasing the interrupt. */ - smp_wmb(); - - local_bh_disable(); - if (vringh_need_notify_iotlb(&txq->vring) > 0) - vringh_notify(&txq->vring); - if (vringh_need_notify_iotlb(&rxq->vring) > 0) - vringh_notify(&rxq->vring); - local_bh_enable(); - - if (++pkts > 4) { - schedule_work(&vdpasim->work); - goto out; - } - } - -out: - spin_unlock(&vdpasim->lock); -} - static int dir_to_perm(enum dma_data_direction dir) { int perm = -EFAULT; @@ -379,7 +222,7 @@ static const struct dma_map_ops vdpasim_dma_ops = { static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; -static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) { const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; @@ -424,23 +267,10 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) if (!vdpasim->buffer) goto err_iommu; - if (macaddr) { - mac_pton(macaddr, macaddr_buf); - if (!is_valid_ether_addr(macaddr_buf)) { - ret = -EADDRNOTAVAIL; - goto err_iommu; - } - } else { - eth_random_addr(macaddr_buf); - } - for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; - ret = vdpa_register_device(&vdpasim->vdpa); - if (ret) - goto err_iommu; return vdpasim; @@ -449,6 +279,7 @@ err_iommu: err_alloc: return ERR_PTR(ret); } +EXPORT_SYMBOL_GPL(vdpasim_create); static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 desc_area, u64 driver_area, @@ -769,46 +600,6 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .free = vdpasim_free, }; -static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) -{ - struct virtio_net_config *net_config = - (struct virtio_net_config *)config; - - net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); - net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); - memcpy(net_config->mac, macaddr_buf, ETH_ALEN); -} - -static int __init vdpasim_dev_init(void) -{ - struct vdpasim_dev_attr dev_attr = {}; - - dev_attr.id = VIRTIO_ID_NET; - dev_attr.supported_features = VDPASIM_NET_FEATURES; - dev_attr.nvqs = VDPASIM_VQ_NUM; - dev_attr.config_size = sizeof(struct virtio_net_config); - dev_attr.get_config = vdpasim_net_get_config; - dev_attr.work_fn = vdpasim_net_work; - dev_attr.buffer_size = PAGE_SIZE; - - vdpasim_dev = vdpasim_create(&dev_attr); - - if (!IS_ERR(vdpasim_dev)) - return 0; - - return PTR_ERR(vdpasim_dev); -} - -static void __exit vdpasim_dev_exit(void) -{ - struct vdpa_device *vdpa = &vdpasim_dev->vdpa; - - vdpa_unregister_device(vdpa); -} - -module_init(vdpasim_dev_init) -module_exit(vdpasim_dev_exit) - MODULE_VERSION(DRV_VERSION); MODULE_LICENSE(DRV_LICENSE); MODULE_AUTHOR(DRV_AUTHOR); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h new file mode 100644 index 000000000000..b02142293d5b --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + */ + +#ifndef _VDPA_SIM_H +#define _VDPA_SIM_H + +#include +#include +#include +#include +#include + +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +struct vdpasim; + +struct vdpasim_virtqueue { + struct vringh vring; + struct vringh_kiov in_iov; + struct vringh_kiov out_iov; + unsigned short head; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u32 num; + void *private; + irqreturn_t (*cb)(void *data); +}; + +struct vdpasim_dev_attr { + u64 supported_features; + size_t config_size; + size_t buffer_size; + int nvqs; + u32 id; + + work_func_t work_fn; + void (*get_config)(struct vdpasim *vdpasim, void *config); + void (*set_config)(struct vdpasim *vdpasim, const void *config); +}; + +/* State of each vdpasim device */ +struct vdpasim { + struct vdpa_device vdpa; + struct vdpasim_virtqueue *vqs; + struct work_struct work; + struct vdpasim_dev_attr dev_attr; + /* spinlock to synchronize virtqueue state */ + spinlock_t lock; + /* virtio config according to device type */ + void *config; + struct vhost_iotlb *iommu; + void *buffer; + u32 status; + u32 generation; + u64 features; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; +}; + +struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr); + +/* TODO: cross-endian support */ +static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) +{ + return virtio_legacy_is_little_endian() || + (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); +} + +static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) +{ + return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) +{ + return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val) +{ + return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val) +{ + return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val) +{ + return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val) +{ + return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val); +} + +#endif diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c new file mode 100644 index 000000000000..c10b6981fdab --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for networking device. + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Jason Wang " +#define DRV_DESC "vDPA Device Simulator for networking device" +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_NET_F_MAC)) + +#define VDPASIM_NET_VQ_NUM 2 + +static char *macaddr; +module_param(macaddr, charp, 0); +MODULE_PARM_DESC(macaddr, "Ethernet MAC address"); + +u8 macaddr_buf[ETH_ALEN]; + +static struct vdpasim *vdpasim_net_dev; + +static void vdpasim_net_work(struct work_struct *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; + struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; + ssize_t read, write; + size_t total_write; + int pkts = 0; + int err; + + spin_lock(&vdpasim->lock); + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + if (!txq->ready || !rxq->ready) + goto out; + + while (true) { + total_write = 0; + err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL, + &txq->head, GFP_ATOMIC); + if (err <= 0) + break; + + err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov, + &rxq->head, GFP_ATOMIC); + if (err <= 0) { + vringh_complete_iotlb(&txq->vring, txq->head, 0); + break; + } + + while (true) { + read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, + vdpasim->buffer, + PAGE_SIZE); + if (read <= 0) + break; + + write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, + vdpasim->buffer, read); + if (write <= 0) + break; + + total_write += write; + } + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&txq->vring, txq->head, 0); + vringh_complete_iotlb(&rxq->vring, rxq->head, total_write); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&txq->vring) > 0) + vringh_notify(&txq->vring); + if (vringh_need_notify_iotlb(&rxq->vring) > 0) + vringh_notify(&rxq->vring); + local_bh_enable(); + + if (++pkts > 4) { + schedule_work(&vdpasim->work); + goto out; + } + } + +out: + spin_unlock(&vdpasim->lock); +} + +static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_net_config *net_config = + (struct virtio_net_config *)config; + + net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); + net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP); + memcpy(net_config->mac, macaddr_buf, ETH_ALEN); +} + +static int __init vdpasim_net_init(void) +{ + struct vdpasim_dev_attr dev_attr = {}; + int ret; + + if (macaddr) { + mac_pton(macaddr, macaddr_buf); + if (!is_valid_ether_addr(macaddr_buf)) { + ret = -EADDRNOTAVAIL; + goto out; + } + } else { + eth_random_addr(macaddr_buf); + } + + dev_attr.id = VIRTIO_ID_NET; + dev_attr.supported_features = VDPASIM_NET_FEATURES; + dev_attr.nvqs = VDPASIM_NET_VQ_NUM; + dev_attr.config_size = sizeof(struct virtio_net_config); + dev_attr.get_config = vdpasim_net_get_config; + dev_attr.work_fn = vdpasim_net_work; + dev_attr.buffer_size = PAGE_SIZE; + + vdpasim_net_dev = vdpasim_create(&dev_attr); + if (IS_ERR(vdpasim_net_dev)) { + ret = PTR_ERR(vdpasim_net_dev); + goto out; + } + + ret = vdpa_register_device(&vdpasim_net_dev->vdpa); + if (ret) + goto put_dev; + + return 0; + +put_dev: + put_device(&vdpasim_net_dev->vdpa.dev); +out: + return ret; +} + +static void __exit vdpasim_net_exit(void) +{ + struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa; + + vdpa_unregister_device(vdpa); +} + +module_init(vdpasim_net_init); +module_exit(vdpasim_net_exit); + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); From 83ef73b27eb2363f44faf9c3ee28a3fe752cfd15 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 9 Dec 2020 16:00:04 +0200 Subject: [PATCH 211/326] vdpa/mlx5: Use write memory barrier after updating CQ index Make sure to put dma write memory barrier after updating CQ consumer index so the hardware knows that there are available CQE slots in the queue. Failure to do this can cause the update of the RX doorbell record to get updated before the CQ consumer index resulting in CQ overrun. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20201209140004.15892-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 1fa6fcac8299..81b932f72e10 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -464,6 +464,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) { mlx5_cq_set_ci(&mvq->cq.mcq); + + /* make sure CQ cosumer update is visible to the hardware before updating + * RX doorbell record. + */ + dma_wmb(); rx_post(&mvq->vqqp, num); if (mvq->event_cb.callback) mvq->event_cb.callback(mvq->event_cb.private); From 697d1549140cdcdc4cfcd0bf94e62643008972b7 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 9 Dec 2020 16:42:03 +0800 Subject: [PATCH 212/326] tools/virtio: include asm/bug.h WARN_ON is used in drivers/vhost/vringh.c, to avoid build failure, need include asm/bug.h Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20201209084205.24062-2-peng.fan@oss.nxp.com Signed-off-by: Michael S. Tsirkin --- tools/virtio/linux/bug.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h index b14c2c3b6b85..813baf13f62a 100644 --- a/tools/virtio/linux/bug.h +++ b/tools/virtio/linux/bug.h @@ -2,6 +2,8 @@ #ifndef BUG_H #define BUG_H +#include + #define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond)) #define BUILD_BUG_ON(x) From b9ca93bcd186ec4144df91c619f6084cdad500ec Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 9 Dec 2020 16:42:04 +0800 Subject: [PATCH 213/326] tools/virtio: add krealloc_array krealloc_array is used in drivers/vhost/vringh.c, add it to avoid build failure. Drop WARN_ON_ONCE, because duplicated with the one in bug.h Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20201209084205.24062-3-peng.fan@oss.nxp.com Signed-off-by: Michael S. Tsirkin --- tools/virtio/linux/kernel.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 315e85cabeda..0b493542e61a 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -117,6 +118,16 @@ static inline void free_page(unsigned long addr) # define unlikely(x) (__builtin_expect(!!(x), 0)) # endif +static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + + return krealloc(p, bytes, gfp); +} + #define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__) #ifdef DEBUG #define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__) @@ -126,8 +137,6 @@ static inline void free_page(unsigned long addr) #define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) #define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__) -#define WARN_ON_ONCE(cond) (unlikely(cond) ? fprintf (stderr, "WARNING\n") : 0) - #define min(x, y) ({ \ typeof(x) _min1 = (x); \ typeof(y) _min2 = (y); \ From 1a5514cbb09aaf694d26ef26fd6da5c5d495cc22 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 9 Dec 2020 16:42:05 +0800 Subject: [PATCH 214/326] tools/virtio: add barrier for aarch64 Add barrier for aarch64 for cross compiling, and most are from Linux Kernel. Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20201209084205.24062-4-peng.fan@oss.nxp.com Signed-off-by: Michael S. Tsirkin --- tools/virtio/asm/barrier.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h index 04d563fc9b95..468435ed64e6 100644 --- a/tools/virtio/asm/barrier.h +++ b/tools/virtio/asm/barrier.h @@ -16,6 +16,16 @@ # define mb() abort() # define dma_rmb() abort() # define dma_wmb() abort() +#elif defined(__aarch64__) +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") +#define virt_mb() __sync_synchronize() +#define virt_rmb() dmb(ishld) +#define virt_wmb() dmb(ishst) +#define virt_store_mb(var, value) do { WRITE_ONCE(var, value); dmb(ish); } while (0) +/* Weak barriers should be used. If not - it's a bug */ +# define mb() abort() +# define dma_rmb() abort() +# define dma_wmb() abort() #else #error Please fill in barrier macros #endif From ae93d8ea0fa701e84ab9df0db9fb60ec6c80d7b8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Dec 2020 17:23:00 +0300 Subject: [PATCH 215/326] virtio_ring: Cut and paste bugs in vring_create_virtqueue_packed() There is a copy and paste bug in the error handling of this code and it uses "ring_dma_addr" three times instead of "device_event_dma_addr" and "driver_event_dma_addr". Fixes: 1ce9e6055fa0 (" virtio_ring: introduce packed ring support") Reported-by: Robert Buhren Reported-by: Felicitas Hetzelt Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/X8pGRJlEzyn+04u2@mwanda Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio_ring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index becc77697960..924b6b85376b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1676,9 +1676,9 @@ err_desc_extra: err_desc_state: kfree(vq); err_vq: - vring_free_queue(vdev, event_size_in_bytes, device, ring_dma_addr); + vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr); err_device: - vring_free_queue(vdev, event_size_in_bytes, driver, ring_dma_addr); + vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr); err_driver: vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr); err_ring: From 411ea23a76526e6efed0b601abb603d3c981b333 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Dec 2020 17:23:16 +0300 Subject: [PATCH 216/326] virtio_net: Fix error code in probe() Set a negative error code intead of returning success if the MTU has been changed to something invalid. Fixes: fe36cbe0671e ("virtio_net: clear MTU when out of range") Reported-by: Robert Buhren Reported-by: Felicitas Hetzelt Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/X8pGVJSeeCdII1Ys@mwanda Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 21b71148c532..34bb95dd9239 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3072,6 +3072,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev_err(&vdev->dev, "device MTU appears to have changed it is now %d < %d", mtu, dev->min_mtu); + err = -EINVAL; goto free; } From e152d8af4220a05c9797591609151d404866beaa Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Dec 2020 17:23:36 +0300 Subject: [PATCH 217/326] virtio_ring: Fix two use after free bugs The "vq" struct is added to the "vdev->vqs" list prematurely. If we encounter an error later in the function then the "vq" is freed, but since it is still on the list that could lead to a use after free bug. Fixes: cbeedb72b97a ("virtio_ring: allocate desc state for split ring separately") Reported-by: Robert Buhren Reported-by: Felicitas Hetzelt Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/X8pGaG/zkI3jk8mk@mwanda Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio_ring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 924b6b85376b..71e16b53e9c1 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1608,7 +1608,6 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->num_added = 0; vq->packed_ring = true; vq->use_dma_api = vring_use_dma_api(vdev); - list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; @@ -1669,6 +1668,7 @@ static struct virtqueue *vring_create_virtqueue_packed( cpu_to_le16(vq->packed.event_flags_shadow); } + list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; err_desc_extra: @@ -2085,7 +2085,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->last_used_idx = 0; vq->num_added = 0; vq->use_dma_api = vring_use_dma_api(vdev); - list_add_tail(&vq->vq.list, &vdev->vqs); #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; @@ -2127,6 +2126,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); + list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; } EXPORT_SYMBOL_GPL(__vring_new_virtqueue); From 2e1139d613c7fb0956e82f72a8281c0a475ad4f8 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 16:43:30 +0800 Subject: [PATCH 218/326] vhost scsi: fix error return code in vhost_scsi_set_endpoint() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 25b98b64e284 ("vhost scsi: alloc cmds per vq instead of session") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1607071411-33484-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/scsi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 6ff8a5096691..4ce9f00ae10e 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1643,7 +1643,8 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (!vhost_vq_is_setup(vq)) continue; - if (vhost_scsi_setup_vq_cmds(vq, vq->num)) + ret = vhost_scsi_setup_vq_cmds(vq, vq->num); + if (ret) goto destroy_vq_cmds; } From 1e38f0031c3055c9c7e5ffcb3bb09c95f69614ee Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Wed, 2 Dec 2020 12:19:30 +0100 Subject: [PATCH 219/326] uapi: virtio_ids.h: consistent indentions Fixing the differing indentions to be consistent and properly aligned. Signed-off-by: Enrico Weigelt, metux IT consult Link: https://lore.kernel.org/r/20201202111931.31953-1-info@metux.net Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_ids.h | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index b052355ac7a3..3cb55e5277a1 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -29,24 +29,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#define VIRTIO_ID_NET 1 /* virtio net */ -#define VIRTIO_ID_BLOCK 2 /* virtio block */ -#define VIRTIO_ID_CONSOLE 3 /* virtio console */ -#define VIRTIO_ID_RNG 4 /* virtio rng */ -#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ -#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ -#define VIRTIO_ID_SCSI 8 /* virtio scsi */ -#define VIRTIO_ID_9P 9 /* 9p virtio console */ -#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ -#define VIRTIO_ID_CAIF 12 /* Virtio caif */ -#define VIRTIO_ID_GPU 16 /* virtio GPU */ -#define VIRTIO_ID_INPUT 18 /* virtio input */ -#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ -#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ -#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ -#define VIRTIO_ID_MEM 24 /* virtio mem */ -#define VIRTIO_ID_FS 26 /* virtio filesystem */ -#define VIRTIO_ID_PMEM 27 /* virtio pmem */ -#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ +#define VIRTIO_ID_NET 1 /* virtio net */ +#define VIRTIO_ID_BLOCK 2 /* virtio block */ +#define VIRTIO_ID_CONSOLE 3 /* virtio console */ +#define VIRTIO_ID_RNG 4 /* virtio rng */ +#define VIRTIO_ID_BALLOON 5 /* virtio balloon */ +#define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ +#define VIRTIO_ID_SCSI 8 /* virtio scsi */ +#define VIRTIO_ID_9P 9 /* 9p virtio console */ +#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ +#define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_GPU 16 /* virtio GPU */ +#define VIRTIO_ID_INPUT 18 /* virtio input */ +#define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ +#define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ +#define VIRTIO_ID_MEM 24 /* virtio mem */ +#define VIRTIO_ID_FS 26 /* virtio filesystem */ +#define VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ #endif /* _LINUX_VIRTIO_IDS_H */ From be618636de4186521ffba2cbe5105e9c3481b9cb Mon Sep 17 00:00:00 2001 From: "Enrico Weigelt, metux IT consult" Date: Wed, 2 Dec 2020 12:19:31 +0100 Subject: [PATCH 220/326] uapi: virtio_ids: add missing device type IDs from OASIS spec The OASIS virtio spec (1.1) defines several IDs that aren't reflected in the header yet. Fixing this by adding the missing IDs, even though they're not yet used by the kernel yet. Signed-off-by: Enrico Weigelt, metux IT consult Link: https://lore.kernel.org/r/20201202111931.31953-2-info@metux.net Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_ids.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 3cb55e5277a1..bc1c0621f5ed 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -34,15 +34,21 @@ #define VIRTIO_ID_CONSOLE 3 /* virtio console */ #define VIRTIO_ID_RNG 4 /* virtio rng */ #define VIRTIO_ID_BALLOON 5 /* virtio balloon */ +#define VIRTIO_ID_IOMEM 6 /* virtio ioMemory */ #define VIRTIO_ID_RPMSG 7 /* virtio remote processor messaging */ #define VIRTIO_ID_SCSI 8 /* virtio scsi */ #define VIRTIO_ID_9P 9 /* 9p virtio console */ +#define VIRTIO_ID_MAC80211_WLAN 10 /* virtio WLAN MAC */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_CAIF 12 /* Virtio caif */ +#define VIRTIO_ID_MEMORY_BALLOON 13 /* virtio memory balloon */ #define VIRTIO_ID_GPU 16 /* virtio GPU */ +#define VIRTIO_ID_CLOCK 17 /* virtio clock/timer */ #define VIRTIO_ID_INPUT 18 /* virtio input */ #define VIRTIO_ID_VSOCK 19 /* virtio vsock transport */ #define VIRTIO_ID_CRYPTO 20 /* virtio crypto */ +#define VIRTIO_ID_SIGNAL_DIST 21 /* virtio signal distribution device */ +#define VIRTIO_ID_PSTORE 22 /* virtio pstore device */ #define VIRTIO_ID_IOMMU 23 /* virtio IOMMU */ #define VIRTIO_ID_MEM 24 /* virtio mem */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ From 476c135e321716ad7a8a5d4a19a636e2dcc50526 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 12 Nov 2020 08:39:59 +0200 Subject: [PATCH 221/326] vdpa: Add missing comment for virtqueue count Add missing comment for number of virtqueue. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20201112064005.349268-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 30bc7a7223bb..0fefeb976877 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -42,6 +42,7 @@ struct vdpa_vq_state { * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @nvqs: maximum number of supported virtqueues */ struct vdpa_device { struct device dev; From 418eddef050d5f6393c303a94e3173847ab85466 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 12 Nov 2020 08:40:00 +0200 Subject: [PATCH 222/326] vdpa: Use simpler version of ida allocation vdpa doesn't have any specific need to define start and end range of the device index. Hence use the simper version of the ida allocator. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20201112064005.349268-3-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index a69ffc991e13..c0825650c055 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -89,7 +89,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, if (!vdev) goto err; - err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL); + err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; From d69c6ddd019f31081cc0232fa8ad8ea1cabdf22c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 17 Dec 2020 16:34:29 -0600 Subject: [PATCH 223/326] dt-bindings: Fix JSON pointers The correct syntax for JSON pointers begins with a '/' after the '#'. Without a '/', the string should be interpreted as a subschema identifier. The jsonschema module currently doesn't handle subschema identifiers and incorrectly allows JSON pointers to begin without a '/'. Let's fix this before it becomes a problem when jsonschema module is fixed. Converted with: perl -p -i -e 's/yaml#definitions/yaml#\/definitions/g' `find Documentation/devicetree/bindings/ -name "*.yaml"` Cc: Maxime Ripard Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Daniel Thompson Cc: Jingoo Han Cc: Pavel Machek Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Andrew Lunn Cc: Florian Fainelli Cc: Greg Kroah-Hartman Cc: Mark Brown Cc: netdev@vger.kernel.org Acked-By: Vinod Koul Acked-by: Lee Jones Acked-by: Guenter Roeck Acked-by: Sebastian Reichel Acked-by: Florian Fainelli Acked-by: Mark Brown Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20201217223429.354283-1-robh@kernel.org Signed-off-by: Rob Herring --- .../devicetree/bindings/arm/idle-states.yaml | 2 +- .../bus/allwinner,sun50i-a64-de2.yaml | 2 +- .../bindings/bus/baikal,bt1-axi.yaml | 2 +- .../bindings/connector/usb-connector.yaml | 10 ++--- .../devicetree/bindings/dma/dma-common.yaml | 4 +- .../devicetree/bindings/dma/dma-router.yaml | 2 +- .../devicetree/bindings/dma/ingenic,dma.yaml | 2 +- .../bindings/dma/snps,dma-spear1340.yaml | 10 ++--- .../devicetree/bindings/eeprom/at24.yaml | 4 +- .../devicetree/bindings/eeprom/at25.yaml | 4 +- .../bindings/hwmon/moortec,mr75203.yaml | 2 +- .../bindings/hwmon/sensirion,shtc1.yaml | 4 +- .../devicetree/bindings/hwmon/ti,tmp513.yaml | 2 +- .../bindings/iio/light/upisemi,us5182.yaml | 2 +- .../iio/proximity/semtech,sx9310.yaml | 6 +-- .../devicetree/bindings/input/gpio-keys.yaml | 12 +++--- .../interrupt-controller/mti,gic.yaml | 4 +- .../interrupt-controller/ti,pruss-intc.yaml | 2 +- .../interrupt-controller/ti,sci-inta.yaml | 2 +- .../bindings/leds/backlight/common.yaml | 4 +- .../devicetree/bindings/leds/common.yaml | 16 ++++---- .../devicetree/bindings/leds/leds-lp55xx.yaml | 10 ++--- .../net/allwinner,sun8i-a83t-emac.yaml | 6 +-- .../bindings/net/amlogic,meson-dwmac.yaml | 2 +- .../devicetree/bindings/net/dsa/dsa.yaml | 6 +-- .../bindings/net/ethernet-controller.yaml | 24 ++++++------ .../devicetree/bindings/net/ethernet-phy.yaml | 20 +++++----- .../bindings/net/fsl,qoriq-mc-dpmac.yaml | 2 +- .../devicetree/bindings/net/mdio.yaml | 2 +- .../bindings/net/mediatek,star-emac.yaml | 2 +- .../devicetree/bindings/net/qcom,ipa.yaml | 2 +- .../devicetree/bindings/net/snps,dwmac.yaml | 38 +++++++++---------- .../bindings/net/socionext,uniphier-ave4.yaml | 2 +- .../bindings/net/ti,cpsw-switch.yaml | 2 +- .../devicetree/bindings/net/ti,dp83867.yaml | 12 +++--- .../devicetree/bindings/net/ti,dp83869.yaml | 8 ++-- .../bindings/net/ti,k3-am654-cpsw-nuss.yaml | 4 +- .../bindings/net/wireless/qcom,ath11k.yaml | 2 +- .../devicetree/bindings/phy/ti,omap-usb2.yaml | 4 +- .../power/mediatek,power-controller.yaml | 12 +++--- .../bindings/power/supply/cw2015_battery.yaml | 2 +- .../devicetree/bindings/powerpc/sleep.yaml | 2 +- .../devicetree/bindings/serial/8250.yaml | 6 +-- .../bindings/soc/ti/k3-ringacc.yaml | 2 +- .../sound/allwinner,sun4i-a10-codec.yaml | 2 +- .../bindings/sound/st,stm32-sai.yaml | 4 +- 46 files changed, 138 insertions(+), 138 deletions(-) diff --git a/Documentation/devicetree/bindings/arm/idle-states.yaml b/Documentation/devicetree/bindings/arm/idle-states.yaml index ea805c1e6b20..52bce5dbb11f 100644 --- a/Documentation/devicetree/bindings/arm/idle-states.yaml +++ b/Documentation/devicetree/bindings/arm/idle-states.yaml @@ -313,7 +313,7 @@ patternProperties: wakeup-latency-us by this duration. idle-state-name: - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string description: A string used as a descriptive name for the idle state. diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml index 0503651cd214..863a287ebc7e 100644 --- a/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml +++ b/Documentation/devicetree/bindings/bus/allwinner,sun50i-a64-de2.yaml @@ -34,7 +34,7 @@ properties: description: The SRAM that needs to be claimed to access the display engine bus. - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array maxItems: 1 ranges: true diff --git a/Documentation/devicetree/bindings/bus/baikal,bt1-axi.yaml b/Documentation/devicetree/bindings/bus/baikal,bt1-axi.yaml index 0bee4694578a..4ac78b44e45e 100644 --- a/Documentation/devicetree/bindings/bus/baikal,bt1-axi.yaml +++ b/Documentation/devicetree/bindings/bus/baikal,bt1-axi.yaml @@ -46,7 +46,7 @@ properties: const: 1 syscon: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Phandle to the Baikal-T1 System Controller DT node interrupts: diff --git a/Documentation/devicetree/bindings/connector/usb-connector.yaml b/Documentation/devicetree/bindings/connector/usb-connector.yaml index a84464b3e1f2..4286ed767a0a 100644 --- a/Documentation/devicetree/bindings/connector/usb-connector.yaml +++ b/Documentation/devicetree/bindings/connector/usb-connector.yaml @@ -37,7 +37,7 @@ properties: description: Size of the connector, should be specified in case of non-fullsize 'usb-a-connector' or 'usb-b-connector' compatible connectors. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: - mini @@ -67,7 +67,7 @@ properties: power-role: description: Determines the power role that the Type C connector will support. "dual" refers to Dual Role Port (DRP). - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: - source @@ -76,7 +76,7 @@ properties: try-power-role: description: Preferred power role. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: - source @@ -86,7 +86,7 @@ properties: data-role: description: Data role if Type C connector supports USB data. "dual" refers Dual Role Device (DRD). - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: - host @@ -105,7 +105,7 @@ properties: Type-C Cable and Connector specification, when Power Delivery is not supported. allOf: - - $ref: /schemas/types.yaml#definitions/string + - $ref: /schemas/types.yaml#/definitions/string enum: - default - 1.5A diff --git a/Documentation/devicetree/bindings/dma/dma-common.yaml b/Documentation/devicetree/bindings/dma/dma-common.yaml index 307b499e8968..ad06d36af208 100644 --- a/Documentation/devicetree/bindings/dma/dma-common.yaml +++ b/Documentation/devicetree/bindings/dma/dma-common.yaml @@ -38,12 +38,12 @@ properties: maxItems: 255 dma-channels: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Number of DMA channels supported by the controller. dma-requests: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Number of DMA request signals supported by the controller. diff --git a/Documentation/devicetree/bindings/dma/dma-router.yaml b/Documentation/devicetree/bindings/dma/dma-router.yaml index 4cee5667b8a8..e72748496fd9 100644 --- a/Documentation/devicetree/bindings/dma/dma-router.yaml +++ b/Documentation/devicetree/bindings/dma/dma-router.yaml @@ -23,7 +23,7 @@ properties: pattern: "^dma-router(@.*)?$" dma-masters: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: Array of phandles to the DMA controllers the router can direct the signal to. diff --git a/Documentation/devicetree/bindings/dma/ingenic,dma.yaml b/Documentation/devicetree/bindings/dma/ingenic,dma.yaml index 00f19b3cac31..6a2043721b95 100644 --- a/Documentation/devicetree/bindings/dma/ingenic,dma.yaml +++ b/Documentation/devicetree/bindings/dma/ingenic,dma.yaml @@ -48,7 +48,7 @@ properties: ingenic,reserved-channels property. ingenic,reserved-channels: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: > Bitmask of channels to reserve for devices that need a specific channel. These channels will only be assigned when explicitely diff --git a/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml b/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml index ef1d6879c158..6b35089ac017 100644 --- a/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml +++ b/Documentation/devicetree/bindings/dma/snps,dma-spear1340.yaml @@ -54,7 +54,7 @@ properties: maximum: 16 dma-masters: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Number of DMA masters supported by the controller. In case if not specified the driver will try to auto-detect this and @@ -63,7 +63,7 @@ properties: maximum: 4 chan_allocation_order: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | DMA channels allocation order specifier. Zero means ascending order (first free allocated), while one - descending (last free allocated). @@ -71,7 +71,7 @@ properties: enum: [0, 1] chan_priority: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | DMA channels priority order. Zero means ascending channels priority so the very first channel has the highest priority. While 1 means @@ -80,7 +80,7 @@ properties: enum: [0, 1] block_size: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Maximum block size supported by the DMA controller. enum: [3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095] @@ -139,7 +139,7 @@ properties: default: 256 snps,dma-protection-control: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Bits one-to-one passed to the AHB HPROT[3:1] bus. Each bit setting indicates the following features: bit 0 - privileged mode, diff --git a/Documentation/devicetree/bindings/eeprom/at24.yaml b/Documentation/devicetree/bindings/eeprom/at24.yaml index 6edfa705b486..d5117c638b75 100644 --- a/Documentation/devicetree/bindings/eeprom/at24.yaml +++ b/Documentation/devicetree/bindings/eeprom/at24.yaml @@ -131,7 +131,7 @@ properties: default: 1 read-only: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Disables writes to the eeprom. @@ -141,7 +141,7 @@ properties: Total eeprom size in bytes. no-read-rollover: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Indicates that the multi-address eeprom does not automatically roll over reads to the next slave address. Please consult the manual of diff --git a/Documentation/devicetree/bindings/eeprom/at25.yaml b/Documentation/devicetree/bindings/eeprom/at25.yaml index 744973637678..121a601db22e 100644 --- a/Documentation/devicetree/bindings/eeprom/at25.yaml +++ b/Documentation/devicetree/bindings/eeprom/at25.yaml @@ -45,13 +45,13 @@ properties: spi-max-frequency: true pagesize: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072] description: Size of the eeprom page. size: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Total eeprom size in bytes. diff --git a/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml b/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml index 6f3e3c01f717..b79f069a04c2 100644 --- a/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml +++ b/Documentation/devicetree/bindings/hwmon/moortec,mr75203.yaml @@ -32,7 +32,7 @@ properties: PVT controller has 5 VM (voltage monitor) sensors. vm-map defines CPU core to VM instance mapping. A value of 0xff means that VM sensor is unused. - $ref: /schemas/types.yaml#definitions/uint8-array + $ref: /schemas/types.yaml#/definitions/uint8-array maxItems: 5 clocks: diff --git a/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml b/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml index c523a1beb2b7..7d49478d9668 100644 --- a/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml +++ b/Documentation/devicetree/bindings/hwmon/sensirion,shtc1.yaml @@ -29,12 +29,12 @@ properties: const: 0x70 sensirion,blocking-io: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, the driver hold the i2c bus until measurement is finished. sensirion,low-precision: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, the sensor aquire data with low precision (not recommended). The driver aquire data with high precision by default. diff --git a/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml b/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml index c17e5d3ee3f1..8020d739a078 100644 --- a/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml +++ b/Documentation/devicetree/bindings/hwmon/ti,tmp513.yaml @@ -61,7 +61,7 @@ properties: Array of three(TMP513) or two(TMP512) n-Factor value for each remote temperature channel. See datasheet Table 11 for n-Factor range list and value interpretation. - $ref: /schemas/types.yaml#definitions/uint32-array + $ref: /schemas/types.yaml#/definitions/uint32-array minItems: 2 maxItems: 3 items: diff --git a/Documentation/devicetree/bindings/iio/light/upisemi,us5182.yaml b/Documentation/devicetree/bindings/iio/light/upisemi,us5182.yaml index 4a9b2827cf7b..de5882cb3360 100644 --- a/Documentation/devicetree/bindings/iio/light/upisemi,us5182.yaml +++ b/Documentation/devicetree/bindings/iio/light/upisemi,us5182.yaml @@ -45,7 +45,7 @@ properties: default: 0x16 upisemi,continuous: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: | This chip has two power modes: one-shot (chip takes one measurement and then shuts itself down) and continuous (chip takes continuous diff --git a/Documentation/devicetree/bindings/iio/proximity/semtech,sx9310.yaml b/Documentation/devicetree/bindings/iio/proximity/semtech,sx9310.yaml index ccfb163f3d34..5de0bb2180e6 100644 --- a/Documentation/devicetree/bindings/iio/proximity/semtech,sx9310.yaml +++ b/Documentation/devicetree/bindings/iio/proximity/semtech,sx9310.yaml @@ -72,7 +72,7 @@ properties: - finest semtech,startup-sensor: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [0, 1, 2, 3] default: 0 description: @@ -81,7 +81,7 @@ properties: compensation. semtech,proxraw-strength: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [0, 2, 4, 8] default: 2 description: @@ -89,7 +89,7 @@ properties: represent 1-1/N. semtech,avg-pos-strength: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [0, 16, 64, 128, 256, 512, 1024, 4294967295] default: 16 description: diff --git a/Documentation/devicetree/bindings/input/gpio-keys.yaml b/Documentation/devicetree/bindings/input/gpio-keys.yaml index 6966ab009fa3..060a309ff8e7 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.yaml +++ b/Documentation/devicetree/bindings/input/gpio-keys.yaml @@ -34,13 +34,13 @@ patternProperties: linux,code: description: Key / Axis code to emit. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 linux,input-type: description: Specify event type this button/key generates. If not specified defaults to <1> == EV_KEY. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 default: 1 @@ -56,12 +56,12 @@ patternProperties: linux,input-value = <0xffffffff>; /* -1 */ - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 debounce-interval: description: Debouncing interval time in milliseconds. If not specified defaults to 5. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 default: 5 @@ -79,7 +79,7 @@ patternProperties: EV_ACT_ANY - both asserted and deasserted EV_ACT_ASSERTED - asserted EV_ACT_DEASSERTED - deasserted - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [0, 1, 2] linux,can-disable: @@ -118,7 +118,7 @@ then: poll-interval: description: Poll interval time in milliseconds - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 required: - poll-interval diff --git a/Documentation/devicetree/bindings/interrupt-controller/mti,gic.yaml b/Documentation/devicetree/bindings/interrupt-controller/mti,gic.yaml index 039e08af98bb..91bb3c2307a7 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mti,gic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/mti,gic.yaml @@ -42,7 +42,7 @@ properties: Specifies the list of CPU interrupt vectors to which the GIC may not route interrupts. This property is ignored if the CPU is started in EIC mode. - $ref: /schemas/types.yaml#definitions/uint32-array + $ref: /schemas/types.yaml#/definitions/uint32-array minItems: 1 maxItems: 6 uniqueItems: true @@ -56,7 +56,7 @@ properties: It accepts two values: the 1st is the starting interrupt and the 2nd is the size of the reserved range. If not specified, the driver will allocate the last (2 * number of VPEs in the system). - $ref: /schemas/types.yaml#definitions/uint32-array + $ref: /schemas/types.yaml#/definitions/uint32-array items: - minimum: 0 maximum: 254 diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,pruss-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,pruss-intc.yaml index 1c4c009dedd0..c2ce215501a5 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,pruss-intc.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,pruss-intc.yaml @@ -80,7 +80,7 @@ properties: mapping is provided. ti,irqs-reserved: - $ref: /schemas/types.yaml#definitions/uint8 + $ref: /schemas/types.yaml#/definitions/uint8 description: | Bitmask of host interrupts between 0 and 7 (corresponding to PRUSS INTC output interrupts 2 through 9) that are not connected to the Arm interrupt diff --git a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml index b5af12011499..3d89668573e8 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/ti,sci-inta.yaml @@ -76,7 +76,7 @@ properties: "limit" specifies the limit for translation ti,unmapped-event-sources: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: Array of phandles to DMA controllers where the unmapped events originate. diff --git a/Documentation/devicetree/bindings/leds/backlight/common.yaml b/Documentation/devicetree/bindings/leds/backlight/common.yaml index bc817f77d2b1..702ba350d869 100644 --- a/Documentation/devicetree/bindings/leds/backlight/common.yaml +++ b/Documentation/devicetree/bindings/leds/backlight/common.yaml @@ -22,7 +22,7 @@ properties: The default brightness that should be applied to the LED by the operating system on start-up. The brightness should not exceed the brightness the LED can provide. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 max-brightness: description: @@ -31,6 +31,6 @@ properties: on the brightness apart from what the driver says, as it could happen that a LED can be made so bright that it gets damaged or causes damage due to restrictions in a specific system, such as mounting conditions. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 additionalProperties: true diff --git a/Documentation/devicetree/bindings/leds/common.yaml b/Documentation/devicetree/bindings/leds/common.yaml index f1211e7045f1..b1f363747a62 100644 --- a/Documentation/devicetree/bindings/leds/common.yaml +++ b/Documentation/devicetree/bindings/leds/common.yaml @@ -27,21 +27,21 @@ properties: List of device current outputs the LED is connected to. The outputs are identified by the numbers that must be defined in the LED device binding documentation. - $ref: /schemas/types.yaml#definitions/uint32-array + $ref: /schemas/types.yaml#/definitions/uint32-array function: description: LED function. Use one of the LED_FUNCTION_* prefixed definitions from the header include/dt-bindings/leds/common.h. If there is no matching LED_FUNCTION available, add a new one. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string color: description: Color of the LED. Use one of the LED_COLOR_ID_* prefixed definitions from the header include/dt-bindings/leds/common.h. If there is no matching LED_COLOR_ID available, add a new one. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 minimum: 0 maximum: 9 @@ -49,7 +49,7 @@ properties: description: Integer to be used when more than one instance of the same function is needed, differing only with an ordinal number. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 label: description: @@ -66,7 +66,7 @@ properties: produced where the LED momentarily turns off (or on). The "keep" setting will keep the LED at whatever its current state is, without producing a glitch. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: - on - off @@ -77,7 +77,7 @@ properties: description: This parameter, if present, is a string defining the trigger assigned to the LED. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string enum: # LED will act as a back-light, controlled by the framebuffer system @@ -109,7 +109,7 @@ properties: brightness and duration (in ms). The exact format is described in: Documentation/devicetree/bindings/leds/leds-trigger-pattern.txt - $ref: /schemas/types.yaml#definitions/uint32-matrix + $ref: /schemas/types.yaml#/definitions/uint32-matrix items: minItems: 2 maxItems: 2 @@ -143,7 +143,7 @@ properties: the device tree and be referenced by a phandle and a set of phandle arguments. A length of arguments should be specified by the #trigger-source-cells property in the source node. - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array # Required properties for flash LED child nodes: flash-max-microamp: diff --git a/Documentation/devicetree/bindings/leds/leds-lp55xx.yaml b/Documentation/devicetree/bindings/leds/leds-lp55xx.yaml index 58e974793a79..f552cd143d5b 100644 --- a/Documentation/devicetree/bindings/leds/leds-lp55xx.yaml +++ b/Documentation/devicetree/bindings/leds/leds-lp55xx.yaml @@ -35,7 +35,7 @@ properties: description: I2C slave address clock-mode: - $ref: /schemas/types.yaml#definitions/uint8 + $ref: /schemas/types.yaml#/definitions/uint8 description: | Input clock mode enum: @@ -49,7 +49,7 @@ properties: GPIO attached to the chip's enable pin pwr-sel: - $ref: /schemas/types.yaml#definitions/uint8 + $ref: /schemas/types.yaml#/definitions/uint8 description: | LP8501 specific property. Power selection for output channels. enum: @@ -70,14 +70,14 @@ patternProperties: $ref: common.yaml# properties: led-cur: - $ref: /schemas/types.yaml#definitions/uint8 + $ref: /schemas/types.yaml#/definitions/uint8 description: | Current setting at each LED channel (mA x10, 0 if LED is not connected) minimum: 0 maximum: 255 max-cur: - $ref: /schemas/types.yaml#definitions/uint8 + $ref: /schemas/types.yaml#/definitions/uint8 description: Maximun current at each LED channel. reg: @@ -97,7 +97,7 @@ patternProperties: - 8 # LED output D9 chan-name: - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string description: name of channel required: diff --git a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml index c7c9ad4e3f9f..7f2578d48e3f 100644 --- a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml +++ b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml @@ -38,7 +38,7 @@ properties: const: stmmaceth syscon: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Phandle to the device containing the EMAC or GMAC clock register @@ -114,7 +114,7 @@ allOf: then: properties: allwinner,leds-active-low: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: EPHY LEDs are active low. @@ -126,7 +126,7 @@ allOf: const: allwinner,sun8i-h3-mdio-mux mdio-parent-bus: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Phandle to EMAC MDIO. diff --git a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml index 6b057b117aa0..1f133f4a2924 100644 --- a/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml +++ b/Documentation/devicetree/bindings/net/amlogic,meson-dwmac.yaml @@ -60,7 +60,7 @@ allOf: - const: timing-adjustment amlogic,tx-delay-ns: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: The internal RGMII TX clock delay (provided by this driver) in nanoseconds. Allowed values are 0ns, 2ns, 4ns, 6ns. diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.yaml b/Documentation/devicetree/bindings/net/dsa/dsa.yaml index 8e044631bcf7..8a3494db4d8d 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.yaml +++ b/Documentation/devicetree/bindings/net/dsa/dsa.yaml @@ -54,7 +54,7 @@ patternProperties: description: Describes the label associated with this port, which will become the netdev name - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string link: description: @@ -62,13 +62,13 @@ patternProperties: port is used as the outgoing port towards the phandle ports. The full routing information must be given, not just the one hop routes to neighbouring switches - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array ethernet: description: Should be a phandle to a valid Ethernet device node. This host device is what the switch port is connected to - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle phy-handle: true diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index cc93063a8f39..0965f6515f9e 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -16,7 +16,7 @@ properties: local-mac-address: description: Specifies the MAC address that was assigned to the network device. - $ref: /schemas/types.yaml#definitions/uint8-array + $ref: /schemas/types.yaml#/definitions/uint8-array items: - minItems: 6 maxItems: 6 @@ -27,20 +27,20 @@ properties: program; should be used in cases where the MAC address assigned to the device by the boot program is different from the local-mac-address property. - $ref: /schemas/types.yaml#definitions/uint8-array + $ref: /schemas/types.yaml#/definitions/uint8-array items: - minItems: 6 maxItems: 6 max-frame-size: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Maximum transfer unit (IEEE defined MTU), rather than the maximum frame size (there\'s contradiction in the Devicetree Specification). max-speed: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Specifies maximum speed in Mbit/s supported by the device. @@ -101,7 +101,7 @@ properties: $ref: "#/properties/phy-connection-type" phy-handle: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Specifies a reference to a node representing a PHY device. @@ -114,7 +114,7 @@ properties: deprecated: true rx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: The size of the controller\'s receive fifo in bytes. This is used for components that can have configurable receive fifo sizes, @@ -129,12 +129,12 @@ properties: If this property is present then the MAC applies the RX delay. sfp: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Specifies a reference to a node representing a SFP cage. tx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: The size of the controller\'s transmit fifo in bytes. This is used for components that can have configurable fifo sizes. @@ -150,7 +150,7 @@ properties: description: Specifies the PHY management type. If auto is set and fixed-link is not specified, it uses MDIO for management. - $ref: /schemas/types.yaml#definitions/string + $ref: /schemas/types.yaml#/definitions/string default: auto enum: - auto @@ -198,17 +198,17 @@ properties: speed: description: Link speed. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [10, 100, 1000] full-duplex: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Indicates that full-duplex is used. When absent, half duplex is assumed. asym-pause: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Indicates that asym_pause should be enabled. diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml index 6dd72faebd89..2766fe45bb98 100644 --- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -78,57 +78,57 @@ properties: Maximum PHY supported speed in Mbits / seconds. broken-turn-around: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, indicates the PHY device does not correctly release the turn around line low at end of the control phase of the MDIO transaction. enet-phy-lane-swap: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, indicates the PHY will swap the TX/RX lanes to compensate for the board being designed with the lanes swapped. eee-broken-100tx: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. eee-broken-1000t: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. eee-broken-10gt: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. eee-broken-1000kx: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. eee-broken-10gkx4: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. eee-broken-10gkr: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Mark the corresponding energy efficient ethernet mode as broken and request the ethernet to stop advertising it. phy-is-integrated: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, indicates that the PHY is integrated into the same physical package as the Ethernet MAC. If needed, muxers @@ -158,7 +158,7 @@ properties: this property is missing the delay will be skipped. sfp: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Specifies a reference to a node representing a SFP cage. diff --git a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml index 2159b7d1f537..7f620a71a972 100644 --- a/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml +++ b/Documentation/devicetree/bindings/net/fsl,qoriq-mc-dpmac.yaml @@ -31,7 +31,7 @@ properties: phy-mode: true pcs-handle: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: A reference to a node representing a PCS PHY device found on the internal MDIO bus. diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index e811e0fd851c..08e15fb1584f 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -70,7 +70,7 @@ patternProperties: The ID number for the device. broken-turn-around: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: If set, indicates the MDIO device does not correctly release the turn around line low at end of the control phase of the diff --git a/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml b/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml index 0bbd598704e9..e6a5ff208253 100644 --- a/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml +++ b/Documentation/devicetree/bindings/net/mediatek,star-emac.yaml @@ -42,7 +42,7 @@ properties: - const: trans mediatek,pericfg: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Phandle to the device containing the PERICFG register range. This is used to control the MII mode. diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml index 4d8464b2676d..d0cbbcf1b0e5 100644 --- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml @@ -114,7 +114,7 @@ properties: validating firwmare used by the GSI. modem-remoteproc: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: This defines the phandle to the remoteproc node representing the modem subsystem. This is requied so the IPA driver can diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 11a6fdb657c9..b2f6083f556a 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -126,7 +126,7 @@ properties: in a different mode than the PHY in order to function. snps,axi-config: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: AXI BUS Mode parameters. Phandle to a node that can contain the following properties @@ -141,7 +141,7 @@ properties: * snps,rb, rebuild INCRx Burst snps,mtl-rx-config: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Multiple RX Queues parameters. Phandle to a node that can contain the following properties @@ -164,7 +164,7 @@ properties: * snps,priority, RX queue priority (Range 0x0 to 0xF) snps,mtl-tx-config: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Multiple TX Queues parameters. Phandle to a node that can contain the following properties @@ -198,7 +198,7 @@ properties: snps,reset-active-low: deprecated: true - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Indicates that the PHY Reset is active low @@ -208,55 +208,55 @@ properties: Triplet of delays. The 1st cell is reset pre-delay in micro seconds. The 2nd cell is reset pulse in micro seconds. The 3rd cell is reset post-delay in micro seconds. - $ref: /schemas/types.yaml#definitions/uint32-array + $ref: /schemas/types.yaml#/definitions/uint32-array minItems: 3 maxItems: 3 snps,aal: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Use Address-Aligned Beats snps,fixed-burst: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Program the DMA to use the fixed burst mode snps,mixed-burst: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Program the DMA to use the mixed burst mode snps,force_thresh_dma_mode: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Force DMA to use the threshold mode for both tx and rx snps,force_sf_dma_mode: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Force DMA to use the Store and Forward mode for both tx and rx. This flag is ignored if force_thresh_dma_mode is set. snps,en-tx-lpi-clockgating: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Enable gating of the MAC TX clock during TX low-power mode snps,multicast-filter-bins: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Number of multicast filter hash bins supported by this device instance snps,perfect-filter-entries: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Number of perfect filter entries supported by this device instance snps,ps-speed: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: Port selection speed that can be passed to the core when PCS is supported. For example, this is used in case of SGMII and @@ -307,25 +307,25 @@ allOf: snps,pbl: description: Programmable Burst Length (tx and rx) - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [2, 4, 8] snps,txpbl: description: Tx Programmable Burst Length. If set, DMA tx will use this value rather than snps,pbl. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [2, 4, 8] snps,rxpbl: description: Rx Programmable Burst Length. If set, DMA rx will use this value rather than snps,pbl. - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 enum: [2, 4, 8] snps,no-pbl-x8: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Don\'t multiply the pbl/txpbl/rxpbl values by 8. For core rev < 3.50, don\'t multiply the values by 4. @@ -351,7 +351,7 @@ allOf: then: properties: snps,tso: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Enables the TSO feature otherwise it will be managed by MAC HW capability register. diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml index cbacc04fc9e6..8a03a24a2019 100644 --- a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml +++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.yaml @@ -64,7 +64,7 @@ properties: - const: ether # for others socionext,syscon-phy-mode: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: A phandle to syscon with one argument that configures phy mode. The argument is the ID of MAC instance. diff --git a/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml b/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml index dadeb8f811c0..07a00f53adbf 100644 --- a/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml +++ b/Documentation/devicetree/bindings/net/ti,cpsw-switch.yaml @@ -70,7 +70,7 @@ properties: pinctrl-names: true syscon: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: Phandle to the system control device node which provides access to efuse IO range with MAC addresses diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.yaml b/Documentation/devicetree/bindings/net/ti,dp83867.yaml index 4050a3608658..047d757e8d82 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83867.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83867.yaml @@ -47,31 +47,31 @@ properties: takes precedence. tx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Transmitt FIFO depth see dt-bindings/net/ti-dp83867.h for values rx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Receive FIFO depth see dt-bindings/net/ti-dp83867.h for values ti,clk-output-sel: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Muxing option for CLK_OUT pin. See dt-bindings/net/ti-dp83867.h for applicable values. The CLK_OUT pin can also be disabled by this property. When omitted, the PHY's default will be left as is. ti,rx-internal-delay: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | RGMII Receive Clock Delay - see dt-bindings/net/ti-dp83867.h for applicable values. Required only if interface type is PHY_INTERFACE_MODE_RGMII_ID or PHY_INTERFACE_MODE_RGMII_RXID. ti,tx-internal-delay: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | RGMII Transmit Clock Delay - see dt-bindings/net/ti-dp83867.h for applicable values. Required only if interface type is @@ -101,7 +101,7 @@ properties: ti,fifo-depth: deprecated: true - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Transmitt FIFO depth- see dt-bindings/net/ti-dp83867.h for applicable values. diff --git a/Documentation/devicetree/bindings/net/ti,dp83869.yaml b/Documentation/devicetree/bindings/net/ti,dp83869.yaml index c3235f08e326..70a1209cb13b 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83869.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83869.yaml @@ -44,22 +44,22 @@ properties: to a maximum value (70 ohms). tx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Transmitt FIFO depth see dt-bindings/net/ti-dp83869.h for values rx-fifo-depth: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Receive FIFO depth see dt-bindings/net/ti-dp83869.h for values ti,clk-output-sel: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Muxing option for CLK_OUT pin see dt-bindings/net/ti-dp83869.h for values. ti,op-mode: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Operational mode for the PHY. If this is not set then the operational mode is set by the straps. see dt-bindings/net/ti-dp83869.h for values diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index 227270cbf892..c47b58f3e3f6 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -119,12 +119,12 @@ properties: description: label associated with this port ti,mac-only: - $ref: /schemas/types.yaml#definitions/flag + $ref: /schemas/types.yaml#/definitions/flag description: Specifies the port works in mac-only mode. ti,syscon-efuse: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: Phandle to the system control device node which provides access to efuse IO range with MAC addresses diff --git a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml index 6af999191559..85c2f699d602 100644 --- a/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml +++ b/Documentation/devicetree/bindings/net/wireless/qcom,ath11k.yaml @@ -136,7 +136,7 @@ properties: - const: tcl2host-status-ring qcom,rproc: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: DT entry of q6v5-wcss remoteproc driver. Phandle to a node that can contain the following properties diff --git a/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml b/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml index 83d5d0aceb04..cbbf5e8b1197 100644 --- a/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml +++ b/Documentation/devicetree/bindings/phy/ti,omap-usb2.yaml @@ -44,13 +44,13 @@ properties: - const: refclk syscon-phy-power: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: phandle/offset pair. Phandle to the system control module and register offset to power on/off the PHY. ctrl-module: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: (deprecated) phandle of the control module used by PHY driver to power on the PHY. Use syscon-phy-power instead. diff --git a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml index fd12bafe3548..d14cb9bac849 100644 --- a/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml +++ b/Documentation/devicetree/bindings/power/mediatek,power-controller.yaml @@ -83,11 +83,11 @@ patternProperties: SUSBSYS clocks. mediatek,infracfg: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG register range. mediatek,smi: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the SMI register range. patternProperties: @@ -131,11 +131,11 @@ patternProperties: SUSBSYS clocks. mediatek,infracfg: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG register range. mediatek,smi: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the SMI register range. patternProperties: @@ -179,11 +179,11 @@ patternProperties: SUSBSYS clocks. mediatek,infracfg: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the INFRACFG register range. mediatek,smi: - $ref: /schemas/types.yaml#definitions/phandle + $ref: /schemas/types.yaml#/definitions/phandle description: phandle to the device containing the SMI register range. required: diff --git a/Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml b/Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml index ee92e6a076ac..5fcdf5801536 100644 --- a/Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml +++ b/Documentation/devicetree/bindings/power/supply/cw2015_battery.yaml @@ -27,7 +27,7 @@ properties: of this binary blob is kept secret by CellWise. The only way to obtain it is to mail two batteries to a test facility of CellWise and receive back a test report with the binary blob. - $ref: /schemas/types.yaml#definitions/uint8-array + $ref: /schemas/types.yaml#/definitions/uint8-array minItems: 64 maxItems: 64 diff --git a/Documentation/devicetree/bindings/powerpc/sleep.yaml b/Documentation/devicetree/bindings/powerpc/sleep.yaml index 6494c7d08b93..1b0936a5beec 100644 --- a/Documentation/devicetree/bindings/powerpc/sleep.yaml +++ b/Documentation/devicetree/bindings/powerpc/sleep.yaml @@ -42,6 +42,6 @@ select: true properties: sleep: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array additionalProperties: true diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index c1d4c196f005..f54cae9ff7b2 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -126,7 +126,7 @@ properties: maxItems: 1 current-speed: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: The current active speed of the UART. reg-offset: @@ -154,7 +154,7 @@ properties: Set to indicate that the port does not implement loopback test mode. fifo-size: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: The fifo size of the UART. auto-flow-control: @@ -165,7 +165,7 @@ properties: property. tx-threshold: - $ref: /schemas/types.yaml#definitions/uint32 + $ref: /schemas/types.yaml#/definitions/uint32 description: | Specify the TX FIFO low water indication for parts with programmable TX FIFO thresholds. diff --git a/Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml b/Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml index c3c595e235a8..ddea3d41971d 100644 --- a/Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml +++ b/Documentation/devicetree/bindings/soc/ti/k3-ringacc.yaml @@ -55,7 +55,7 @@ properties: description: TI-SCI RM subtype for GP ring range ti,sci: - $ref: /schemas/types.yaml#definitions/phandle-array + $ref: /schemas/types.yaml#/definitions/phandle-array description: phandle on TI-SCI compatible System controller node ti,sci-dev-id: diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml index be390accdd07..dd47fef9854d 100644 --- a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml +++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-codec.yaml @@ -57,7 +57,7 @@ properties: A list of the connections between audio components. Each entry is a pair of strings, the first being the connection's sink, the second being the connection's source. - $ref: /schemas/types.yaml#definitions/non-unique-string-array + $ref: /schemas/types.yaml#/definitions/non-unique-string-array minItems: 2 maxItems: 18 items: diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.yaml b/Documentation/devicetree/bindings/sound/st,stm32-sai.yaml index 6ad48c7681c1..f2443b651282 100644 --- a/Documentation/devicetree/bindings/sound/st,stm32-sai.yaml +++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.yaml @@ -106,7 +106,7 @@ patternProperties: Must contain the phandle and index of the SAI sub-block providing the synchronization. allOf: - - $ref: /schemas/types.yaml#definitions/phandle-array + - $ref: /schemas/types.yaml#/definitions/phandle-array - maxItems: 1 st,iec60958: @@ -117,7 +117,7 @@ patternProperties: configured according to protocol defined in related DAI link node, such as i2s, left justified, right justified, dsp and pdm protocols. allOf: - - $ref: /schemas/types.yaml#definitions/flag + - $ref: /schemas/types.yaml#/definitions/flag "#clock-cells": description: Configure the SAI device as master clock provider. From 479a41748fdd8aa3eb933b0fac554fb2b7931334 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 17 Nov 2020 14:07:52 -0600 Subject: [PATCH 224/326] media: dt-bindings: coda: Add missing 'additionalProperties' 'additionalProperties' is now required by the meta-schema. Add it for coda. As a result, 'interrupts', 'interrupt-names' and 'power-domains' need to be reworked to be defined at the top level. Cc: Philipp Zabel Cc: Mauro Carvalho Chehab Cc: linux-media@vger.kernel.org Signed-off-by: Rob Herring Reviewed-by: Philipp Zabel Link: https://lore.kernel.org/r/20201117200752.4004368-1-robh@kernel.org Signed-off-by: Rob Herring --- .../devicetree/bindings/media/coda.yaml | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/devicetree/bindings/media/coda.yaml b/Documentation/devicetree/bindings/media/coda.yaml index 7bac0057faf7..36781ee4617f 100644 --- a/Documentation/devicetree/bindings/media/coda.yaml +++ b/Documentation/devicetree/bindings/media/coda.yaml @@ -44,6 +44,21 @@ properties: - const: per - const: ahb + interrupts: + minItems: 1 + items: + - description: BIT processor interrupt + - description: JPEG unit interrupt + + interrupt-names: + minItems: 1 + items: + - const: bit + - const: jpeg + + power-domains: + maxItems: 1 + resets: maxItems: 1 @@ -59,6 +74,8 @@ required: - clocks - clock-names +additionalProperties: false + allOf: - if: properties: @@ -68,34 +85,17 @@ allOf: then: properties: interrupts: - items: - - description: BIT processor interrupt - - description: JPEG unit interrupt + minItems: 2 interrupt-names: - items: - - const: bit - - const: jpeg + minItems: 2 else: properties: interrupts: - items: - - description: BIT processor interrupt - - - if: - properties: - compatible: - contains: - enum: - - fsl,imx6dl-vpu - - fsl,imx6q-vpu - then: - properties: - power-domains: - $ref: /schemas/types.yaml#/definitions/phandle - description: phandle pointing to the PU power domain maxItems: 1 + power-domains: false + examples: - | vpu: video-codec@63ff4000 { From 64a21a18f55ebafc9e805787770df4e0518db887 Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Wed, 2 Dec 2020 10:05:22 +0100 Subject: [PATCH 225/326] dt-bindings: xlnx,vcu-settings: fix dt_binding_check warnings When running make dt_binding_check, the xlnx,vcu-settings binding triggers the following two warnings: 'additionalProperties' is a required property example-0: vcu@a0041000:reg:0: [0, 2684620800, 0, 4096] is too long Fix the binding and make the checker happy. Signed-off-by: Michael Tretter Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201202090522.251607-1-m.tretter@pengutronix.de Signed-off-by: Rob Herring --- .../bindings/soc/xilinx/xlnx,vcu-settings.yaml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/xilinx/xlnx,vcu-settings.yaml b/Documentation/devicetree/bindings/soc/xilinx/xlnx,vcu-settings.yaml index 378d0ced43c8..cb245f400287 100644 --- a/Documentation/devicetree/bindings/soc/xilinx/xlnx,vcu-settings.yaml +++ b/Documentation/devicetree/bindings/soc/xilinx/xlnx,vcu-settings.yaml @@ -26,9 +26,18 @@ required: - compatible - reg +additionalProperties: false + examples: - | - xlnx_vcu: vcu@a0041000 { - compatible = "xlnx,vcu-settings", "syscon"; - reg = <0x0 0xa0041000 0x0 0x1000>; + fpga { + #address-cells = <2>; + #size-cells = <2>; + + xlnx_vcu: vcu@a0041000 { + compatible = "xlnx,vcu-settings", "syscon"; + reg = <0x0 0xa0041000 0x0 0x1000>; + }; }; + +... From c1efde3f9780ad337df1cc393f6471ac8e24f50f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 4 Dec 2020 17:38:10 +0800 Subject: [PATCH 226/326] dt-bindings: serial: add the required property 'additionalProperties' When I do dt_binding_check for any YAML file, below wanring is always reported: xxx/serial/litex,liteuart.yaml: 'additionalProperties' is a required property xxx/serial/litex,liteuart.yaml: ignoring, error in schema: warning: no schema found in file: xxx/serial/litex,liteuart.yaml Signed-off-by: Zhen Lei Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201204093813.1275-3-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/serial/litex,liteuart.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/serial/litex,liteuart.yaml b/Documentation/devicetree/bindings/serial/litex,liteuart.yaml index bc79b3cca542..c4f1f489dc2d 100644 --- a/Documentation/devicetree/bindings/serial/litex,liteuart.yaml +++ b/Documentation/devicetree/bindings/serial/litex,liteuart.yaml @@ -29,6 +29,8 @@ required: - compatible - reg +additionalProperties: false + examples: - | uart0: serial@e0001800 { From c8f054f10507dc133c9aa51f478dabe772f16288 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 4 Dec 2020 17:38:11 +0800 Subject: [PATCH 227/326] dt-bindings: soc: add the required property 'additionalProperties' When I do dt_binding_check for any YAML file, below wanring is always reported: xxx/soc/litex/litex,soc-controller.yaml: 'additionalProperties' is a required property xxx/soc/litex/litex,soc-controller.yaml: ignoring, error in schema: warning: no schema found in file: xxx/soc/litex/litex,soc-controller.yaml Signed-off-by: Zhen Lei Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201204093813.1275-4-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- .../devicetree/bindings/soc/litex/litex,soc-controller.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/soc/litex/litex,soc-controller.yaml b/Documentation/devicetree/bindings/soc/litex/litex,soc-controller.yaml index e2b788796e79..c8b57c7fd08c 100644 --- a/Documentation/devicetree/bindings/soc/litex/litex,soc-controller.yaml +++ b/Documentation/devicetree/bindings/soc/litex/litex,soc-controller.yaml @@ -28,6 +28,8 @@ required: - compatible - reg +additionalProperties: false + examples: - | soc_ctrl0: soc-controller@f0000000 { From d73982be2b00bbe76b53433cc56a1cd9555b9091 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Fri, 4 Dec 2020 17:38:12 +0800 Subject: [PATCH 228/326] dt-bindings: devapc: add the required property 'additionalProperties' When I do dt_binding_check for any YAML file, below wanring is always reported: xxx/soc/mediatek/devapc.yaml: 'additionalProperties' is a required property xxx/soc/mediatek/devapc.yaml: ignoring, error in schema: warning: no schema found in file: xxx/soc/mediatek/devapc.yaml Signed-off-by: Zhen Lei Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201204093813.1275-5-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/soc/mediatek/devapc.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml index 6c763f873a63..31e4d3c339bf 100644 --- a/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml +++ b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml @@ -44,6 +44,8 @@ required: - clocks - clock-names +additionalProperties: false + examples: - | #include From c4b8c562a75f568026038c001cfa7737dac272da Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Dec 2020 12:23:58 +0800 Subject: [PATCH 229/326] dt-bindings: media: nokia,smia: eliminate yamllint warnings Eliminate the following yamllint warnings: ./Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml :4:1: [error] missing document start "---" (document-start) :29:9: [warning] wrong indentation: expected 10 but found 8 (indentation) :32:9: [warning] wrong indentation: expected 10 but found 8 (indentation) Signed-off-by: Zhen Lei Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201207042400.1498-3-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- .../devicetree/bindings/media/i2c/mipi-ccs.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml b/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml index d94bd67ccea1..0df0334d2d0d 100644 --- a/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml +++ b/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml @@ -1,6 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) # Copyright (C) 2014--2020 Intel Corporation - +%YAML 1.2 +--- $id: http://devicetree.org/schemas/media/i2c/mipi-ccs.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# @@ -26,11 +27,11 @@ properties: compatible: oneOf: - items: - - const: mipi-ccs-1.1 - - const: mipi-ccs + - const: mipi-ccs-1.1 + - const: mipi-ccs - items: - - const: mipi-ccs-1.0 - - const: mipi-ccs + - const: mipi-ccs-1.0 + - const: mipi-ccs - const: nokia,smia reg: From aeefc1a01e7c3905580a981e93032cd452275c99 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Dec 2020 12:48:30 +0800 Subject: [PATCH 230/326] dt-bindings: display: eliminate yamllint warnings Eliminate the following yamllint warnings: ./Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml :52:9: [warning] wrong indentation: expected 6 but found 8 (indentation) ./Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml :42:8: [warning] wrong indentation: expected 8 but found 7 (indentation) :45:8: [warning] wrong indentation: expected 8 but found 7 (indentation) ./Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml :21:6: [warning] wrong indentation: expected 6 but found 5 (indentation) ./Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml :25:10: [warning] wrong indentation: expected 10 but found 9 (indentation) Signed-off-by: Zhen Lei Acked-by: Sam Ravnborg Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20201207044830.1551-2-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- .../devicetree/bindings/display/bridge/analogix,anx7625.yaml | 4 ++-- .../devicetree/bindings/display/bridge/intel,keembay-dsi.yaml | 4 ++-- .../devicetree/bindings/display/intel,keembay-msscam.yaml | 4 ++-- .../devicetree/bindings/display/panel/novatek,nt36672a.yaml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml index 60585a4fc22b..9392b5502a32 100644 --- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml +++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7625.yaml @@ -49,8 +49,8 @@ properties: Video port for panel or connector. required: - - port@0 - - port@1 + - port@0 + - port@1 required: - compatible diff --git a/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml b/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml index ab5be2625224..35c9dfd86650 100644 --- a/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml +++ b/Documentation/devicetree/bindings/display/bridge/intel,keembay-dsi.yaml @@ -39,10 +39,10 @@ properties: properties: '#address-cells': - const: 1 + const: 1 '#size-cells': - const: 0 + const: 0 port@0: type: object diff --git a/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml b/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml index 40caa6118809..a222b52d8b8f 100644 --- a/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml +++ b/Documentation/devicetree/bindings/display/intel,keembay-msscam.yaml @@ -18,8 +18,8 @@ description: | properties: compatible: items: - - const: intel,keembay-msscam - - const: syscon + - const: intel,keembay-msscam + - const: syscon reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml b/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml index d2170de6b723..2f5df1d235ae 100644 --- a/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml +++ b/Documentation/devicetree/bindings/display/panel/novatek,nt36672a.yaml @@ -22,7 +22,7 @@ properties: compatible: items: - enum: - - tianma,fhd-video + - tianma,fhd-video - const: novatek,nt36672a description: This indicates the panel manufacturer of the panel that is in turn using the NT36672A panel driver. This compatible string From 246e18ba725c3b39d9d45b91fd93ce67e772fef4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 7 Dec 2020 12:55:27 +0800 Subject: [PATCH 231/326] dt-bindings: clock: imx8qxp-lpcg: eliminate yamllint warnings Eliminate the following yamllint warnings: ./Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml :32:13:[warning] wrong indentation: expected 14 but found 12 (indentation) :35:9: [warning] wrong indentation: expected 10 but found 8 (indentation) Signed-off-by: Zhen Lei Reviewed-by: Rob Herring Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201207045527.1607-2-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- .../bindings/clock/imx8qxp-lpcg.yaml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml index e709e530e17a..940486ef1051 100644 --- a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml +++ b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml @@ -29,18 +29,18 @@ properties: - const: fsl,imx8qxp-lpcg - items: - enum: - - fsl,imx8qm-lpcg + - fsl,imx8qm-lpcg - const: fsl,imx8qxp-lpcg - enum: - - fsl,imx8qxp-lpcg-adma - - fsl,imx8qxp-lpcg-conn - - fsl,imx8qxp-lpcg-dc - - fsl,imx8qxp-lpcg-dsp - - fsl,imx8qxp-lpcg-gpu - - fsl,imx8qxp-lpcg-hsio - - fsl,imx8qxp-lpcg-img - - fsl,imx8qxp-lpcg-lsio - - fsl,imx8qxp-lpcg-vpu + - fsl,imx8qxp-lpcg-adma + - fsl,imx8qxp-lpcg-conn + - fsl,imx8qxp-lpcg-dc + - fsl,imx8qxp-lpcg-dsp + - fsl,imx8qxp-lpcg-gpu + - fsl,imx8qxp-lpcg-hsio + - fsl,imx8qxp-lpcg-img + - fsl,imx8qxp-lpcg-lsio + - fsl,imx8qxp-lpcg-vpu deprecated: true reg: maxItems: 1 From 21df8683b85611c8267fdf87ebb7b4056b88ad3a Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Thu, 17 Dec 2020 00:59:45 +0000 Subject: [PATCH 232/326] dt-bindings/display: abt,y030xx067a: Fix binding The binding should use "unevaluatedProperties" instead of "additionalProperties", since it is a SPI device and may have SPI-related Device Tree properties, for instance the "spi-max-frequency" property that is present in the example. Fixes: e366a644c69d ("dt-bindings: display: Add ABT Y030XX067A panel bindings") Signed-off-by: Paul Cercueil Reviewed-by: Sam Ravnborg Link: https://lore.kernel.org/r/20201217005945.335111-1-paul%40crapouillou.net Signed-off-by: Rob Herring --- .../devicetree/bindings/display/panel/abt,y030xx067a.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml index 91cb4c3e0198..a108029ecfab 100644 --- a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml +++ b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml @@ -32,7 +32,7 @@ required: - power-supply - reset-gpios -additionalProperties: false +unevaluatedProperties: false examples: - | From dd20166236953c8cd14f4c668bf972af32f0c6be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 19 Dec 2020 03:15:43 +0000 Subject: [PATCH 233/326] io_uring: fix 0-iov read buffer select Doing vectored buf-select read with 0 iovec passed is meaningless and utterly broken, forbid it. Cc: # 5.7+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b74957856e68..f3690dfdd564 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3125,9 +3125,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, iov[0].iov_len = kbuf->len; return 0; } - if (!req->rw.len) - return 0; - else if (req->rw.len > 1) + if (req->rw.len != 1) return -EINVAL; #ifdef CONFIG_COMPAT From 09926202e939fd699650ac0fc0baa5757e069390 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 20 Dec 2020 09:09:43 +0100 Subject: [PATCH 234/326] ALSA: hda/realtek: Add quirk for MSI-GP73 MSI-GP73 (with SSID 1462:1229) requires yet again ALC1220_FIXUP_CLEVO_P950 quirk like other MSI models. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=210793 Cc: Link: https://lore.kernel.org/r/20201220080943.24839-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c203eba8d9ff..5478a89d94e3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2516,6 +2516,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa0ce, "Gigabyte X570 Aorus Xtreme", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x11f7, "MSI-GE63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1462, 0x1229, "MSI-GP73", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1275, "MSI-GL63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1293, "MSI-GP65", ALC1220_FIXUP_CLEVO_P950), From 00c18640c2430c4bafaaeede1f9dd6f7ec0e4b25 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Dec 2020 10:45:02 -0700 Subject: [PATCH 235/326] io_uring: make ctx cancel on exit targeted to actual ctx Before IORING_SETUP_ATTACH_WQ, we could just cancel everything on the io-wq when exiting. But that's not the case if they are shared, so cancel for the specific ctx instead. Cc: stable@vger.kernel.org Fixes: 24369c2e3bb0 ("io_uring: add io-wq workqueue sharing") Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f3690dfdd564..48b9332c2354 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8637,6 +8637,13 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } +static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + return req->ctx == data; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -8651,7 +8658,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_poll_remove_all(ctx, NULL, NULL); if (ctx->io_wq) - io_wq_cancel_all(ctx->io_wq); + io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); From 446bc1c207331080d8c711a4456799b7d0b9df26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 20 Dec 2020 10:47:42 -0700 Subject: [PATCH 236/326] io-wq: kill now unused io_wq_cancel_all() io_uring no longer issues full cancelations on the io-wq, so remove any remnants of this code and the IO_WQ_BIT_CANCEL flag. Signed-off-by: Jens Axboe --- fs/io-wq.c | 30 +----------------------------- fs/io-wq.h | 2 -- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index f72d53848dcb..a564f36e260c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -36,8 +36,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ - IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ - IO_WQ_BIT_ERROR = 2, /* error on setup */ + IO_WQ_BIT_ERROR = 1, /* error on setup */ }; enum { @@ -561,12 +560,6 @@ get_next: next_hashed = wq_next_work(work); io_impersonate_work(worker, work); - /* - * OK to set IO_WQ_WORK_CANCEL even for uncancellable - * work, the worker function will do the right thing. - */ - if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) - work->flags |= IO_WQ_WORK_CANCEL; old_work = work; linked = wq->do_work(work); @@ -732,12 +725,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) return acct->nr_workers < acct->max_workers; } -static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data) -{ - send_sig(SIGINT, worker->task, 1); - return false; -} - /* * Iterate the passed in list and call the specific function for each * worker that isn't exiting @@ -938,21 +925,6 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } -void io_wq_cancel_all(struct io_wq *wq) -{ - int node; - - set_bit(IO_WQ_BIT_CANCEL, &wq->state); - - rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); - } - rcu_read_unlock(); -} - struct io_cb_cancel_data { work_cancel_fn *fn; void *data; diff --git a/fs/io-wq.h b/fs/io-wq.h index 75113bcd5889..b158f8addcf3 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -129,8 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) return work->flags & IO_WQ_WORK_HASHED; } -void io_wq_cancel_all(struct io_wq *wq); - typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, From 55583d72e2303638d30dd4a7aabef59ffa0a017a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Dec 2020 13:21:43 +0000 Subject: [PATCH 237/326] io_uring: always progress task_work on task cancel Might happen that __io_uring_cancel_task_requests() cancels nothing but there are task_works pending. We need to always run them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 48b9332c2354..d0ab6b503a9f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8801,9 +8801,9 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, task, NULL); ret |= io_kill_timeouts(ctx, task, NULL); + ret |= io_run_task_work(); if (!ret) break; - io_run_task_work(); cond_resched(); } } From f57555eda979ca085d2524db81e14b8a6089e15e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 20 Dec 2020 13:21:44 +0000 Subject: [PATCH 238/326] io_uring: end waiting before task cancel attempts Get rid of TASK_UNINTERRUPTIBLE and waiting with finish_wait before going for next iteration in __io_uring_task_cancel(), because __io_uring_files_cancel() doesn't expect that sheduling is disallowed. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d0ab6b503a9f..fbf747803dbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8989,9 +8989,9 @@ void __io_uring_task_cancel(void) if (inflight != tctx_inflight(tctx)) continue; schedule(); + finish_wait(&tctx->wait, &wait); } while (1); - finish_wait(&tctx->wait, &wait); atomic_dec(&tctx->in_idle); } From 8b7c764e0644455a5991abea126e7ca6e03ee723 Mon Sep 17 00:00:00 2001 From: YangHui Date: Mon, 21 Dec 2020 14:22:07 +0800 Subject: [PATCH 239/326] ALSA: core: Remove redundant comments Remove redundant comments Signed-off-by: YangHui Link: https://lore.kernel.org/r/1608531727-5433-1-git-send-email-yanghui.def@gmail.com Signed-off-by: Takashi Iwai --- sound/core/init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sound/core/init.c b/sound/core/init.c index 764dbe673d48..75aec71c48a8 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -149,8 +149,6 @@ static void release_card_device(struct device *dev) * @extra_size: allocate this extra size after the main soundcard structure * @card_ret: the pointer to store the created card instance * - * Creates and initializes a soundcard structure. - * * The function allocates snd_card instance via kzalloc with the given * space for the driver to use freely. The allocated struct is stored * in the given card_ret pointer. From 525d9c57d0eeeb660d9b25e5b2d1c95975e3ba95 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 21 Dec 2020 09:01:59 +0100 Subject: [PATCH 240/326] ALSA: usb-audio: Add alias entry for ASUS PRIME TRX40 PRO-S ASUS PRIME TRX40 PRO-S mobo with 0b05:1918 needs the same quirk alias for another ASUS mobo (0b05:1917) for the proper mixer mapping, etc. Add the corresponding entry. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=210783 Cc: Link: https://lore.kernel.org/r/20201221080159.24468-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/card.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/usb/card.c b/sound/usb/card.c index cb0b6582dfee..d731ca62d599 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -383,6 +383,9 @@ static const struct usb_audio_device_name usb_audio_names[] = { /* ASUS ROG Strix */ PROFILE_NAME(0x0b05, 0x1917, "Realtek", "ALC1220-VB-DT", "Realtek-ALC1220-VB-Desktop"), + /* ASUS PRIME TRX40 PRO-S */ + PROFILE_NAME(0x0b05, 0x1918, + "Realtek", "ALC1220-VB-DT", "Realtek-ALC1220-VB-Desktop"), /* Dell WD15 Dock */ PROFILE_NAME(0x0bda, 0x4014, "Dell", "WD15 Dock", "Dell-WD15-Dock"), From b36f835b636908e4122f2e17310b1dbc380a3b19 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 15 Dec 2020 14:29:06 +1100 Subject: [PATCH 241/326] powerpc/boot: Fix build of dts/fsl The lkp robot reported that some configs fail to build, for example mpc85xx_smp_defconfig, with: cc1: fatal error: opening output file arch/powerpc/boot/dts/fsl/.mpc8540ads.dtb.dts.tmp: No such file or directory This bisects to: cc8a51ca6f05 ("kbuild: always create directories of targets") Although that commit claims to be about in-tree builds, it somehow breaks out-of-tree builds. But presumably it's just exposing a latent bug in our Makefiles. We can fix it by adding to targets for dts/fsl in the same way that we do for dts. Fixes: cc8a51ca6f05 ("kbuild: always create directories of targets") Reported-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201215032906.473460-1-mpe@ellerman.id.au --- arch/powerpc/boot/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index ec0b2186e41c..2b8da923ceca 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -373,6 +373,8 @@ initrd-y := $(filter-out $(image-y), $(initrd-y)) targets += $(image-y) $(initrd-y) targets += $(foreach x, dtbImage uImage cuImage simpleImage treeImage, \ $(patsubst $(x).%, dts/%.dtb, $(filter $(x).%, $(image-y)))) +targets += $(foreach x, dtbImage uImage cuImage simpleImage treeImage, \ + $(patsubst $(x).%, dts/fsl/%.dtb, $(filter $(x).%, $(image-y)))) $(addprefix $(obj)/, $(initrd-y)): $(obj)/ramdisk.image.gz From 0faa22f09caadc11af2aa7570870ebd2ac5b8170 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 20 Dec 2020 18:18:26 +0000 Subject: [PATCH 242/326] powerpc/time: Force inlining of get_tb() Force inlining of get_tb() in order to avoid getting following function in vdso32, leading to suboptimal performance in clock_gettime() 00000688 <.get_tb>: 688: 7c 6d 42 a6 mftbu r3 68c: 7c 8c 42 a6 mftb r4 690: 7d 2d 42 a6 mftbu r9 694: 7c 03 48 40 cmplw r3,r9 698: 40 e2 ff f0 bne+ 688 <.get_tb> 69c: 4e 80 00 20 blr Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/df05d53eed1210cf1aa76d1fb44aa0fab29c018e.1608488286.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/timebase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/vdso/timebase.h b/arch/powerpc/include/asm/vdso/timebase.h index b558b07959ce..881f655caa0a 100644 --- a/arch/powerpc/include/asm/vdso/timebase.h +++ b/arch/powerpc/include/asm/vdso/timebase.h @@ -49,7 +49,7 @@ static inline unsigned long get_tbl(void) return mftb(); } -static inline u64 get_tb(void) +static __always_inline u64 get_tb(void) { unsigned int tbhi, tblo, tbhi2; From 9014eab6a38c60fd185bc92ed60f46cf99a462ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 21 Dec 2020 08:41:54 +0100 Subject: [PATCH 243/326] powerpc/smp: Add __init to init_big_cores() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It fixes this link warning: WARNING: modpost: vmlinux.o(.text.unlikely+0x2d98): Section mismatch in reference from the function init_big_cores.isra.0() to the function .init.text:init_thread_group_cache_map() The function init_big_cores.isra.0() references the function __init init_thread_group_cache_map(). This is often because init_big_cores.isra.0 lacks a __init annotation or the annotation of init_thread_group_cache_map is wrong. Fixes: 425752c63b6f ("powerpc: Detect the presence of big-cores via "ibm, thread-groups"") Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201221074154.403779-1-clg@kaod.org --- arch/powerpc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2b9b1bb4c5f2..9e2246e80efd 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -990,7 +990,7 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -static int init_big_cores(void) +static int __init init_big_cores(void) { int cpu; From 42ed6d56ade21f367f27aa5915cc397510cfdef5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 18 Dec 2020 22:16:17 +1100 Subject: [PATCH 244/326] powerpc/vdso: Block R_PPC_REL24 relocations Add R_PPC_REL24 relocations to the list of relocations we do NOT support in the VDSO. These are generated in some cases and we do not support relocating them at runtime, so if they appear then the VDSO will not work at runtime, therefore it's preferable to break the build if we see them. Fixes: ab037dd87a2f ("powerpc/vdso: Switch VDSO to generic C implementation.") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201218111619.1206391-1-mpe@ellerman.id.au --- arch/powerpc/kernel/vdso32/Makefile | 2 +- arch/powerpc/kernel/vdso64/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 59aa2944ecae..6616f4e794d0 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -2,7 +2,7 @@ # List of files in the vdso, has to be asm only for now -ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN +ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24 include $(srctree)/lib/vdso/Makefile obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index d365810a689a..bf363ff37152 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 # List of files in the vdso, has to be asm only for now -ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN +ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24 include $(srctree)/lib/vdso/Makefile obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o From 107521e8039688f7a9548f17919dfde670b911c1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 18 Dec 2020 22:16:18 +1100 Subject: [PATCH 245/326] powerpc/vdso: Don't pass 64-bit ABI cflags to 32-bit VDSO When building the 32-bit VDSO, we are building 32-bit code as part of a 64-bit kernel build. That requires us to tweak the cflags to trick the compiler into building 32-bit code for us. The main way we do that is by passing -m32, but there are other options that affect code generation and ABI selection. In particular when building vgettimeofday.c, we end up passing -mcall-aixdesc because it's in KBUILD_CFLAGS, which causes the compiler to generate function descriptors, and dot symbols, eg: $ nm arch/powerpc/kernel/vdso32/vgettimeofday.o 000005d0 T .__c_kernel_clock_getres 00000024 D __c_kernel_clock_getres ... We get away with that at the moment because we also use the DOTSYM macro, and that is also incorrectly prepending a '.' in 32-bit VDSO code due to a separate bug. But we shouldn't be generating function descriptors for this file, there's no 32-bit ABI that includes function descriptors, so the resulting object file is some frankenstein and it's surprising that it even links. So filter out all the ABI-related options we add to CFLAGS for 64-bit builds, so that they're not used when building 32-bit code. With that we only see regular text symbols: $ nm arch/powerpc/kernel/vdso32/vgettimeofday.o michael@alpine1-p1 000005d0 T __c_kernel_clock_getres 00000000 T __c_kernel_clock_gettime 00000200 T __c_kernel_clock_gettime64 00000410 T __c_kernel_gettimeofday 00000650 T __c_kernel_time Fixes: ab037dd87a2f ("powerpc/vdso: Switch VDSO to generic C implementation.") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201218111619.1206391-2-mpe@ellerman.id.au --- arch/powerpc/kernel/vdso32/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 6616f4e794d0..9cb6f524854b 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -27,7 +27,7 @@ endif CC32FLAGS := ifdef CONFIG_PPC64 CC32FLAGS += -m32 -KBUILD_CFLAGS := $(filter-out -mcmodel=medium,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS := $(filter-out -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc,$(KBUILD_CFLAGS)) endif targets := $(obj-vdso32) vdso32.so.dbg From 2eda7f11000646909a10298951c9defb2321b240 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 18 Dec 2020 22:16:19 +1100 Subject: [PATCH 246/326] powerpc/vdso: Fix DOTSYM for 32-bit LE VDSO Skirmisher reported on IRC that the 32-bit LE VDSO was hanging. This turned out to be due to a branch to self in eg. __kernel_gettimeofday. Looking at the disassembly with objdump -dR shows why: 00000528 <__kernel_gettimeofday>: 528: f0 ff 21 94 stwu r1,-16(r1) 52c: a6 02 08 7c mflr r0 530: f0 ff 21 94 stwu r1,-16(r1) 534: 14 00 01 90 stw r0,20(r1) 538: 05 00 9f 42 bcl 20,4*cr7+so,53c <__kernel_gettimeofday+0x14> 53c: a6 02 a8 7c mflr r5 540: ff ff a5 3c addis r5,r5,-1 544: c4 fa a5 38 addi r5,r5,-1340 548: f0 00 a5 38 addi r5,r5,240 54c: 01 00 00 48 bl 54c <__kernel_gettimeofday+0x24> 54c: R_PPC_REL24 .__c_kernel_gettimeofday Because we don't process relocations for the VDSO, this branch remains a branch from 0x54c to 0x54c. With the preceding patch to prohibit R_PPC_REL24 relocations, we instead get a build failure: 0000054c R_PPC_REL24 .__c_kernel_gettimeofday 00000598 R_PPC_REL24 .__c_kernel_clock_gettime 000005e4 R_PPC_REL24 .__c_kernel_clock_gettime64 00000630 R_PPC_REL24 .__c_kernel_clock_getres 0000067c R_PPC_REL24 .__c_kernel_time arch/powerpc/kernel/vdso32/vdso32.so.dbg: dynamic relocations are not supported The root cause is that we're branching to `.__c_kernel_gettimeofday`. But this is 32-bit LE code, which doesn't use function descriptors, so there are no dot symbols. The reason we're trying to branch to a dot symbol is because we're using the DOTSYM macro, but the ifdefs we use to define the DOTSYM macro do not currently work for 32-bit LE. So like previous commits we need to differentiate if the current compilation unit is 64-bit, rather than the kernel as a whole. ie. switch from CONFIG_PPC64 to __powerpc64__. With that fixed 32-bit LE code gets the empty version of DOTSYM, which just resolves to the original symbol name, leading to a direct branch and no relocations: 000003f8 <__kernel_gettimeofday>: 3f8: f0 ff 21 94 stwu r1,-16(r1) 3fc: a6 02 08 7c mflr r0 400: f0 ff 21 94 stwu r1,-16(r1) 404: 14 00 01 90 stw r0,20(r1) 408: 05 00 9f 42 bcl 20,4*cr7+so,40c <__kernel_gettimeofday+0x14> 40c: a6 02 a8 7c mflr r5 410: ff ff a5 3c addis r5,r5,-1 414: f4 fb a5 38 addi r5,r5,-1036 418: f0 00 a5 38 addi r5,r5,240 41c: 85 06 00 48 bl aa0 <__c_kernel_gettimeofday> Fixes: ab037dd87a2f ("powerpc/vdso: Switch VDSO to generic C implementation.") Reported-by: "Will Springer " Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201218111619.1206391-3-mpe@ellerman.id.au --- arch/powerpc/include/asm/ppc_asm.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index cfa814824285..cc1bca571332 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -180,7 +180,12 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) #define VCPU_GPR(n) __VCPU_GPR(__REG_##n) #ifdef __KERNEL__ -#ifdef CONFIG_PPC64 + +/* + * We use __powerpc64__ here because we want the compat VDSO to use the 32-bit + * version below in the else case of the ifdef. + */ +#ifdef __powerpc64__ #define STACKFRAMESIZE 256 #define __STK_REG(i) (112 + ((i)-14)*8) From d5c243989fb0cb03c74d7340daca3b819f706ee7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 21 Dec 2020 06:18:03 +0000 Subject: [PATCH 247/326] powerpc/32: Fix vmap stack - Properly set r1 before activating MMU on syscall too We need r1 to be properly set before activating MMU, otherwise any new exception taken while saving registers into the stack in syscall prologs will use the user stack, which is wrong and will even lockup or crash when KUAP is selected. Do that by switching the meaning of r11 and r1 until we have saved r1 to the stack: copy r1 into r11 and setup the new stack pointer in r1. To avoid complicating and impacting all generic and specific prolog code (and more), copy back r1 into r11 once r11 is save onto the stack. We could get rid of copying r1 back and forth at the cost of rewriting everything to use r1 instead of r11 all the way when CONFIG_VMAP_STACK is set, but the effort is probably not worth it for now. Fixes: da7bb43ab9da ("powerpc/32: Fix vmap stack - Properly set r1 before activating MMU") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a3d819d5c348cee9783a311d5d3f3ba9b48fd219.1608531452.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_32.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 541664d95702..a2f72c966baf 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -121,18 +121,28 @@ #ifdef CONFIG_VMAP_STACK mfspr r11, SPRN_SRR0 mtctr r11 -#endif + andi. r11, r9, MSR_PR + mr r11, r1 + lwz r1,TASK_STACK-THREAD(r12) + beq- 99f + addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE + li r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r10 + isync + tovirt(r12, r12) + stw r11,GPR1(r1) + stw r11,0(r1) + mr r11, r1 +#else andi. r11, r9, MSR_PR lwz r11,TASK_STACK-THREAD(r12) beq- 99f addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE -#ifdef CONFIG_VMAP_STACK - li r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ - mtmsr r10 - isync + tophys(r11, r11) + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1, r11) /* set new kernel sp */ #endif - tovirt_vmstack r12, r12 - tophys_novmstack r11, r11 mflr r10 stw r10, _LINK(r11) #ifdef CONFIG_VMAP_STACK @@ -140,9 +150,6 @@ #else mfspr r10,SPRN_SRR0 #endif - stw r1,GPR1(r11) - stw r1,0(r11) - tovirt_novmstack r1, r11 /* set new kernel sp */ stw r10,_NIP(r11) mfcr r10 rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ From 7e90285716518d810857a1d362983d99da9bbf66 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 16 Dec 2020 13:46:54 +0000 Subject: [PATCH 248/326] docs: submitting-patches: Trivial - fix grammatical error "it is a used" does not make sense. Should be "it is used". Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20201216134654.271508-1-lee.jones@linaro.org Signed-off-by: Jonathan Corbet --- Documentation/process/submitting-patches.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index fb8261a4be30..2c48a00a436e 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -446,7 +446,7 @@ patch. This tag documents that potentially interested parties have been included in the discussion. Co-developed-by: states that the patch was co-created by multiple developers; -it is a used to give attribution to co-authors (in addition to the author +it is used to give attribution to co-authors (in addition to the author attributed by the From: tag) when several people work on a single patch. Since Co-developed-by: denotes authorship, every Co-developed-by: must be immediately followed by a Signed-off-by: of the associated co-author. Standard sign-off From 27ab873e0ca640cbe1375aa5a0cdd0607cb6bbdc Mon Sep 17 00:00:00 2001 From: Milan Lakhani Date: Tue, 15 Dec 2020 20:42:36 +0000 Subject: [PATCH 249/326] Documentation: process: Correct numbering Renumber the steps in submit-checklist.rst as some numbers were skipped. Signed-off-by: Milan Lakhani Link: https://lore.kernel.org/r/1608064956-5512-1-git-send-email-milan.lakhani@codethink.co.uk Signed-off-by: Jonathan Corbet --- Documentation/process/submit-checklist.rst | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index 1879f881c300..230ee42f872f 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -75,44 +75,44 @@ and elsewhere regarding submitting Linux kernel patches. 13) Has been build- and runtime tested with and without ``CONFIG_SMP`` and ``CONFIG_PREEMPT.`` -16) All codepaths have been exercised with all lockdep features enabled. +14) All codepaths have been exercised with all lockdep features enabled. -17) All new ``/proc`` entries are documented under ``Documentation/`` +15) All new ``/proc`` entries are documented under ``Documentation/`` -18) All new kernel boot parameters are documented in +16) All new kernel boot parameters are documented in ``Documentation/admin-guide/kernel-parameters.rst``. -19) All new module parameters are documented with ``MODULE_PARM_DESC()`` +17) All new module parameters are documented with ``MODULE_PARM_DESC()`` -20) All new userspace interfaces are documented in ``Documentation/ABI/``. +18) All new userspace interfaces are documented in ``Documentation/ABI/``. See ``Documentation/ABI/README`` for more information. Patches that change userspace interfaces should be CCed to linux-api@vger.kernel.org. -21) Check that it all passes ``make headers_check``. +19) Check that it all passes ``make headers_check``. -22) Has been checked with injection of at least slab and page-allocation +20) Has been checked with injection of at least slab and page-allocation failures. See ``Documentation/fault-injection/``. If the new code is substantial, addition of subsystem-specific fault injection might be appropriate. -23) Newly-added code has been compiled with ``gcc -W`` (use +21) Newly-added code has been compiled with ``gcc -W`` (use ``make EXTRA_CFLAGS=-W``). This will generate lots of noise, but is good for finding bugs like "warning: comparison between signed and unsigned". -24) Tested after it has been merged into the -mm patchset to make sure +22) Tested after it has been merged into the -mm patchset to make sure that it still works with all of the other queued patches and various changes in the VM, VFS, and other subsystems. -25) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a +23) All memory barriers {e.g., ``barrier()``, ``rmb()``, ``wmb()``} need a comment in the source code that explains the logic of what they are doing and why. -26) If any ioctl's are added by the patch, then also update +24) If any ioctl's are added by the patch, then also update ``Documentation/userspace-api/ioctl/ioctl-number.rst``. -27) If your modified source code depends on or uses any of the kernel +25) If your modified source code depends on or uses any of the kernel APIs or features that are related to the following ``Kconfig`` symbols, then test multiple builds with the related ``Kconfig`` symbols disabled and/or ``=m`` (if that option is available) [not all of these at the From 9bf19b78a203b6ed20ed7b5d7222f5ef7a49aed4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 17 Dec 2020 19:37:56 +0100 Subject: [PATCH 250/326] Documentation/submitting-patches: Document the SoB chain Document what a chain of Signed-off-by's in a patch commit message should mean, explicitly. This has been carved out from a tip subsystem handbook patchset by Thomas Gleixner: https://lkml.kernel.org/r/20181107171010.421878737@linutronix.de and incorporates follow-on comments. Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20201217183756.GE23634@zn.tnic Signed-off-by: Jonathan Corbet --- Documentation/process/submitting-patches.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index 2c48a00a436e..5ba54120bef7 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -411,6 +411,12 @@ Some people also put extra tags at the end. They'll just be ignored for now, but you can do this to mark internal company procedures or just point out some special detail about the sign-off. +Any further SoBs (Signed-off-by:'s) following the author's SoB are from +people handling and transporting the patch, but were not involved in its +development. SoB chains should reflect the **real** route a patch took +as it was propagated to the maintainers and ultimately to Linus, with +the first SoB entry signalling primary authorship of a single author. + When to use Acked-by:, Cc:, and Co-developed-by: ------------------------------------------------ From c635b0cea6b812898563809a13e65278989b2c72 Mon Sep 17 00:00:00 2001 From: Fengfei Xi Date: Thu, 10 Dec 2020 16:21:34 +0800 Subject: [PATCH 251/326] docs: admin-guide: Fix default value of max_map_count in sysctl/vm.rst Since the default value of sysctl_max_map_count is defined as DEFAULT_MAX_MAP_COUNT from mm/util.c int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; DEFAULT_MAX_MAP_COUNT is defined as 65530 (65535-5) in include/linux/mm.h #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) Signed-off-by: Fengfei Xi Link: https://lore.kernel.org/r/20201210082134.36957-1-xi.fengfei@h3c.com Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/vm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index e0cf17ad2ae5..82b5e7ad3e91 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -428,7 +428,7 @@ While most applications need less than a thousand maps, certain programs, particularly malloc debuggers, may consume lots of them, e.g., up to one or two maps per allocation. -The default value is 65536. +The default value is 65530. memory_failure_early_kill: From a528b04ea40690ff40501f50d618a62a02b19620 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Dec 2020 18:34:04 +0000 Subject: [PATCH 252/326] io_uring: fix ignoring xa_store errors xa_store() may fail, check the result. Cc: stable@vger.kernel.org # 5.10 Fixes: 0f2122045b946 ("io_uring: don't rely on weak ->files references") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fbf747803dbc..846c635d0620 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8852,10 +8852,9 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; + int ret; if (unlikely(!tctx)) { - int ret; - ret = io_uring_alloc_task_context(current); if (unlikely(ret)) return ret; @@ -8866,7 +8865,12 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) if (!old) { get_file(file); - xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL); + ret = xa_err(xa_store(&tctx->xa, (unsigned long)file, + file, GFP_KERNEL)); + if (ret) { + fput(file); + return ret; + } } tctx->last = file; } From 2e2cbaf920d14de9a96180ddefd6861bcc46f07d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 9 Dec 2020 18:38:24 -0500 Subject: [PATCH 253/326] fix hostfs_open() use of ->f_path.dentry this is one of the cases where we need to use d_real() - we are using more than the name of dentry here. ->d_sb is used as well, so in case of hostfs being used as a layer we get the wrong superblock. Reported-by: Johannes Berg Tested-by: Johannes Berg Signed-off-by: Al Viro --- fs/hostfs/hostfs_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index c070c0d8e3e9..aea35459d390 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -315,7 +315,7 @@ retry: if (mode & FMODE_WRITE) r = w = 1; - name = dentry_name(file->f_path.dentry); + name = dentry_name(d_real(file->f_path.dentry, file->f_inode)); if (name == NULL) return -ENOMEM; From de043da0b9e71147ca610ed542d34858aadfc61c Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Fri, 18 Dec 2020 16:13:56 -0800 Subject: [PATCH 254/326] RISC-V: Fix usage of memblock_enforce_memory_limit memblock_enforce_memory_limit accepts the maximum memory size not the maximum address that can be handled by kernel. Fix the function invocation accordingly. Fixes: 1bd14a66ee52 ("RISC-V: Remove any memblock representing unusable memory area") Cc: stable@vger.kernel.org Reported-by: Bin Meng Tested-by: Bin Meng Acked-by: Mike Rapoport Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 13ba533f462b..bf5379135e39 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -176,7 +176,7 @@ void __init setup_bootmem(void) * Make sure that any memory beyond mem_start + (-PAGE_OFFSET) is removed * as it is unusable by kernel. */ - memblock_enforce_memory_limit(mem_start - PAGE_OFFSET); + memblock_enforce_memory_limit(-PAGE_OFFSET); /* Reserve from the start of the kernel to the end of the kernel */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); From 43877226829eda91856b055d217b3033805fd76f Mon Sep 17 00:00:00 2001 From: Mike Oliphant Date: Mon, 21 Dec 2020 13:55:33 -0800 Subject: [PATCH 255/326] ALSA: usb-audio: Add implicit feeback support for the BOSS GT-1 The BOSS GT-1 (USB ID 0582:01d6) requires implicit feedback like other similar BOSS devices. This patch adds this support. [ rearranged the table entry in the ID order -- tiwai ] Signed-off-by: Mike Oliphant Link: https://lore.kernel.org/r/20201221215533.2511-1-oliphant@nostatic.org Signed-off-by: Takashi Iwai --- sound/usb/implicit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/implicit.c b/sound/usb/implicit.c index 4e911d200562..eb3a4c433c3e 100644 --- a/sound/usb/implicit.c +++ b/sound/usb/implicit.c @@ -75,6 +75,7 @@ static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { /* No quirk for playback but with capture quirk (see below) */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x0130), /* BOSS BR-80 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x0189), /* BOSS GT-100v2 */ + IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d6), /* BOSS GT-1 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d8), /* BOSS Katana */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01e5), /* BOSS GT-001 */ @@ -85,6 +86,7 @@ static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { static const struct snd_usb_implicit_fb_match capture_implicit_fb_quirks[] = { IMPLICIT_FB_FIXED_DEV(0x0582, 0x0130, 0x0d, 0x01), /* BOSS BR-80 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x0189, 0x0d, 0x01), /* BOSS GT-100v2 */ + IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d6, 0x0d, 0x01), /* BOSS GT-1 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d8, 0x0d, 0x01), /* BOSS Katana */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01e5, 0x0d, 0x01), /* BOSS GT-001 */ From 6ca653e3f73a1af0f30dbf9c2c79d2897074989f Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Tue, 22 Dec 2020 23:04:58 +0800 Subject: [PATCH 256/326] ALSA: hda/realtek: Apply jack fixup for Quanta NL3 The Quanta NL3 laptop has both a headphone output jack and a headset jack, on the right edge of the chassis. The pin information suggests that both of these are at the Front. The PulseAudio is confused to differentiate them so one of the jack can neither get the jack sense working nor the audio output. The ALC269_FIXUP_LIFEBOOK chained with ALC269_FIXUP_QUANTA_MUTE can help to differentiate 2 jacks and get the 'Auto-Mute Mode' working correctly. Signed-off-by: Chris Chiu Cc: Link: https://lore.kernel.org/r/20201222150459.9545-1-chiu@endlessos.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5478a89d94e3..e9ea2e3fc811 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8030,6 +8030,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x152d, 0x1082, "Quanta NL3", ALC269_FIXUP_LIFEBOOK), SND_PCI_QUIRK(0x1558, 0x1323, "Clevo N130ZU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1325, "System76 Darter Pro (darp5)", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1401, "Clevo L140[CZ]U", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), From 13be30f156fda725b168ac89fc91f78651575307 Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Tue, 22 Dec 2020 23:04:59 +0800 Subject: [PATCH 257/326] ALSA/hda: apply jack fixup for the Acer Veriton N4640G/N6640G/N2510G This Acer Veriton N4640G/N6640G/N2510G desktops have 2 headphone jacks(front and rear), and a separate Mic In jack. The rear headphone jack is actually a line out jack but always silent while playing audio. The front 'Mic In' also fails the jack sensing. Apply the ALC269_FIXUP_LIFEBOOK to have all audio jacks to work as expected. Signed-off-by: Chris Chiu Signed-off-by: Jian-Hong Pan Cc: Link: https://lore.kernel.org/r/20201222150459.9545-2-chiu@endlessos.org Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e9ea2e3fc811..dde5ba209541 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7817,11 +7817,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x0762, "Acer Aspire E1-472", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572), SND_PCI_QUIRK(0x1025, 0x0775, "Acer Aspire E1-572", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572), SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS), + SND_PCI_QUIRK(0x1025, 0x101c, "Acer Veriton N2510G", ALC269_FIXUP_LIFEBOOK), SND_PCI_QUIRK(0x1025, 0x102b, "Acer Aspire C24-860", ALC286_FIXUP_ACER_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1065, "Acer Aspire C20-820", ALC269VC_FIXUP_ACER_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x106d, "Acer Cloudbook 14", ALC283_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x1025, 0x1099, "Acer Aspire E5-523G", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x110e, "Acer Aspire ES1-432", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1025, 0x1166, "Acer Veriton N4640G", ALC269_FIXUP_LIFEBOOK), + SND_PCI_QUIRK(0x1025, 0x1167, "Acer Veriton N6640G", ALC269_FIXUP_LIFEBOOK), SND_PCI_QUIRK(0x1025, 0x1246, "Acer Predator Helios 500", ALC299_FIXUP_PREDATOR_SPK), SND_PCI_QUIRK(0x1025, 0x1247, "Acer vCopperbox", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS), SND_PCI_QUIRK(0x1025, 0x1248, "Acer Veriton N4660G", ALC269VC_FIXUP_ACER_MIC_NO_PRESENCE), From adf4c01aba575c02ae7335255f9f9a379e594c5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 10 Dec 2020 08:55:42 +0100 Subject: [PATCH 258/326] MAINTAINERS: add fs/block_dev.c to the block section fs/block_dev.c is a pretty integral part of the block layer, so make sure it is mentioned in MAINTAINERS. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8c1c5f9830c3..ce9c7ab54b5c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3187,6 +3187,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: block/ F: drivers/block/ +F: fs/block_dev.c F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c From ca2e270aa1aa214d77d06c705d1f19524cde3faf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 10 Dec 2020 08:55:43 +0100 Subject: [PATCH 259/326] block: remove a pointless self-reference in block_dev.c There is no point in duplicating the file name in the top of the file comment. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9e56ee1f2652..d2fa5009d5a4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/block_dev.c - * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli SuSE */ From 7b51e703a89b824dbbf65de96e77d10d4915dbe0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 10 Dec 2020 08:55:44 +0100 Subject: [PATCH 260/326] block: update some copyrights Update copyrights for files that have gotten some major rewrites lately. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/genhd.c | 2 ++ block/partitions/core.c | 1 + fs/block_dev.c | 1 + 3 files changed, 4 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index b84b8671e627..73faec438e49 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* * gendisk handling + * + * Portions Copyright (C) 2020 Christoph Hellwig */ #include diff --git a/block/partitions/core.c b/block/partitions/core.c index deca253583bd..e7d776db803b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -2,6 +2,7 @@ /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King + * Copyright (C) 2020 Christoph Hellwig */ #include #include diff --git a/fs/block_dev.c b/fs/block_dev.c index d2fa5009d5a4..9293045e128c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2,6 +2,7 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include From c92dc856848f32781e37b88c1b7f875e274f5efb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 27 Nov 2020 12:34:00 +0100 Subject: [PATCH 261/326] ext4: defer saving error info from atomic context When filesystem inconsistency is detected with group locked, we currently try to modify superblock to store error there without blocking. However this can cause superblock checksum failures (or DIF/DIX failure) when the superblock is just being written out. Make error handling code just store error information in ext4_sb_info structure and copy it to on-disk superblock only in ext4_commit_super(). In case of error happening with group locked, we just postpone the superblock flushing to a workqueue. [ Added fixup so that s_first_error_* does not get updated after the file system is remounted. Also added fix for syzbot failure. - Ted ] Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201127113405.26867-8-jack@suse.cz Signed-off-by: Theodore Ts'o Cc: Hillf Danton Reported-by: syzbot+9043030c040ce1849a60@syzkaller.appspotmail.com --- fs/ext4/ext4.h | 21 +++++++++ fs/ext4/super.c | 118 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 103 insertions(+), 36 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84c63b9b8310..ac356ed33a22 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1629,6 +1629,27 @@ struct ext4_sb_info { errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; + /* Information about errors that happened during this mount */ + spinlock_t s_error_lock; + int s_add_error_count; + int s_first_error_code; + __u32 s_first_error_line; + __u32 s_first_error_ino; + __u64 s_first_error_block; + const char *s_first_error_func; + time64_t s_first_error_time; + int s_last_error_code; + __u32 s_last_error_line; + __u32 s_last_error_ino; + __u64 s_last_error_block; + const char *s_last_error_func; + time64_t s_last_error_time; + /* + * If we are in a context where we cannot update error information in + * the on-disk superblock, we queue this work to do it. + */ + struct work_struct s_error_work; + /* Ext4 fast commit stuff */ atomic_t s_fc_subtid; atomic_t s_fc_ineligible_updates; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0e6d9a4bd72f..4bbfb05aae58 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -404,10 +404,8 @@ void ext4_itable_unused_set(struct super_block *sb, bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } -static void __ext4_update_tstamp(__le32 *lo, __u8 *hi) +static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { - time64_t now = ktime_get_real_seconds(); - now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); @@ -419,7 +417,8 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ - __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) + __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ + ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) @@ -591,7 +590,7 @@ static void __save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_sb_info *sbi = EXT4_SB(sb); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (bdev_read_only(sb->s_bdev)) @@ -599,30 +598,24 @@ static void __save_error_info(struct super_block *sb, int error, /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_update_tstamp(es, s_last_error_time); - strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); - es->s_last_error_line = cpu_to_le32(line); - es->s_last_error_ino = cpu_to_le32(ino); - es->s_last_error_block = cpu_to_le64(block); - es->s_last_error_errcode = ext4_errno_to_code(error); - if (!es->s_first_error_time) { - es->s_first_error_time = es->s_last_error_time; - es->s_first_error_time_hi = es->s_last_error_time_hi; - strncpy(es->s_first_error_func, func, - sizeof(es->s_first_error_func)); - es->s_first_error_line = cpu_to_le32(line); - es->s_first_error_ino = es->s_last_error_ino; - es->s_first_error_block = es->s_last_error_block; - es->s_first_error_errcode = es->s_last_error_errcode; + + spin_lock(&sbi->s_error_lock); + sbi->s_add_error_count++; + sbi->s_last_error_code = error; + sbi->s_last_error_line = line; + sbi->s_last_error_ino = ino; + sbi->s_last_error_block = block; + sbi->s_last_error_func = func; + sbi->s_last_error_time = ktime_get_real_seconds(); + if (!sbi->s_first_error_time) { + sbi->s_first_error_code = error; + sbi->s_first_error_line = line; + sbi->s_first_error_ino = ino; + sbi->s_first_error_block = block; + sbi->s_first_error_func = func; + sbi->s_first_error_time = sbi->s_last_error_time; } - /* - * Start the daily error reporting function if it hasn't been - * started already - */ - if (!es->s_error_count) - mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - le32_add_cpu(&es->s_error_count, 1); + spin_unlock(&sbi->s_error_lock); } static void save_error_info(struct super_block *sb, int error, @@ -685,6 +678,14 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro) sb->s_flags |= SB_RDONLY; } +static void flush_stashed_error_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, + s_error_work); + + ext4_commit_super(sbi->s_sb, 1); +} + #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") @@ -925,8 +926,6 @@ __acquires(bitlock) return; trace_ext4_error(sb, function, line); - __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -942,16 +941,15 @@ __acquires(bitlock) va_end(args); } - if (test_opt(sb, WARN_ON_ERROR)) - WARN_ON_ONCE(1); - if (test_opt(sb, ERRORS_CONT)) { - ext4_commit_super(sb, 0); + if (test_opt(sb, WARN_ON_ERROR)) + WARN_ON_ONCE(1); + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); + schedule_work(&EXT4_SB(sb)->s_error_work); return; } - ext4_unlock_group(sb, grp); - ext4_commit_super(sb, 1); + save_error_info(sb, EFSCORRUPTED, ino, block, function, line); ext4_handle_error(sb, false); /* * We only get here in the ERRORS_RO case; relocking the group @@ -1124,6 +1122,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); + flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); /* @@ -4666,6 +4665,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + spin_lock_init(&sbi->s_error_lock); + INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -5108,6 +5109,7 @@ failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: del_timer_sync(&sbi->s_err_report); + flush_work(&sbi->s_error_work); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -5434,6 +5436,7 @@ err_out: static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; @@ -5470,6 +5473,46 @@ static int ext4_commit_super(struct super_block *sb, int sync) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); + /* Copy error information to the on-disk superblock */ + spin_lock(&sbi->s_error_lock); + if (sbi->s_add_error_count > 0) { + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + if (!es->s_first_error_time && !es->s_first_error_time_hi) { + __ext4_update_tstamp(&es->s_first_error_time, + &es->s_first_error_time_hi, + sbi->s_first_error_time); + strncpy(es->s_first_error_func, sbi->s_first_error_func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = + cpu_to_le32(sbi->s_first_error_line); + es->s_first_error_ino = + cpu_to_le32(sbi->s_first_error_ino); + es->s_first_error_block = + cpu_to_le64(sbi->s_first_error_block); + es->s_first_error_errcode = + ext4_errno_to_code(sbi->s_first_error_code); + } + __ext4_update_tstamp(&es->s_last_error_time, + &es->s_last_error_time_hi, + sbi->s_last_error_time); + strncpy(es->s_last_error_func, sbi->s_last_error_func, + sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); + es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); + es->s_last_error_errcode = + ext4_errno_to_code(sbi->s_last_error_code); + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); + le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); + sbi->s_add_error_count = 0; + } + spin_unlock(&sbi->s_error_lock); + BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); if (sync) @@ -5823,6 +5866,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } + /* Flush outstanding errors before changing fs state */ + flush_work(&sbi->s_error_work); + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { err = -EROFS; From 82ef1370b0c1757ab4ce29f34c52b4e93839b0aa Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Fri, 4 Dec 2020 11:05:43 +0800 Subject: [PATCH 262/326] ext4: avoid s_mb_prefetch to be zero in individual scenarios Commit cfd732377221 ("ext4: add prefetching for block allocation bitmaps") introduced block bitmap prefetch, and expects to read block bitmaps of flex_bg through an IO. However, it seems to ignore the value range of s_log_groups_per_flex. In the scenario where the value of s_log_groups_per_flex is greater than 27, s_mb_prefetch or s_mb_prefetch_limit will overflow, cause a divide zero exception. In addition, the logic of calculating nr is also flawed, because the size of flexbg is fixed during a single mount, but s_mb_prefetch can be modified, which causes nr to fail to meet the value condition of [1, flexbg_size]. To solve this problem, we need to set the upper limit of s_mb_prefetch. Since we expect to load block bitmaps of a flex_bg through an IO, we can consider determining a reasonable upper limit among the IO limit parameters. After consideration, we chose BLK_MAX_SEGMENT_SIZE. This is a good choice to solve divide zero problem and avoiding performance degradation. [ Some minor code simplifications to make the changes easy to follow -- TYT ] Reported-by: Tosk Robot Signed-off-by: Chunguang Xu Reviewed-by: Samuel Liao Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/1607051143-24508-1-git-send-email-brookxu@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 77815cd110b2..99bf091fee10 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2372,9 +2372,9 @@ repeat: nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { - nr = (group / sbi->s_mb_prefetch) * - sbi->s_mb_prefetch; - nr = nr + sbi->s_mb_prefetch - group; + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = min(nr, sbi->s_mb_prefetch); } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); @@ -2710,7 +2710,8 @@ static int ext4_mb_init_backend(struct super_block *sb) if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO */ - sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; + sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; From be993933d2e997fdb72b8b1418d2a84df79b8962 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Fri, 11 Dec 2020 14:54:24 +0800 Subject: [PATCH 263/326] ext4: remove unnecessary wbc parameter from ext4_bio_write_page ext4_bio_write_page does not need wbc parameter, since its parameter io contains the io_wbc field. The io::io_wbc is initialized by ext4_io_submit_init which is called in ext4_writepages and ext4_writepage functions prior to ext4_bio_write_page. Therefor, when ext4_bio_write_page is called, wbc info has already been included in io parameter. Signed-off-by: Lei Chen Link: https://lore.kernel.org/r/1607669664-25656-1-git-send-email-lennychen@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ac356ed33a22..64f25ea2fa7a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3653,7 +3653,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io); extern int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc, bool keep_towrite); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ab786123a022..27946882d4ce 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2064,7 +2064,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return -ENOMEM; } - ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); + ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2098,7 +2098,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) len = size & ~PAGE_MASK; else len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); + err = ext4_bio_write_page(&mpd->io_submit, page, len, false); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index cb135a94482d..03a44a0de86a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -435,7 +435,6 @@ submit_and_retry: int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len, - struct writeback_control *wbc, bool keep_towrite) { struct page *bounce_page = NULL; @@ -445,6 +444,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, int ret = 0; int nr_submitted = 0; int nr_to_submit = 0; + struct writeback_control *wbc = io->io_wbc; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); From 942cb357ae7d9249088e3687ee6a00ed2745a0c7 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 22 Dec 2020 15:34:24 -0800 Subject: [PATCH 264/326] Smack: Handle io_uring kernel thread privileges Smack assumes that kernel threads are privileged for smackfs operations. This was necessary because the credential of the kernel thread was not related to a user operation. With io_uring the credential does reflect a user's rights and can be used. Suggested-by: Jens Axboe Acked-by: Jens Axboe Acked-by: Eric W. Biederman Signed-off-by: Casey Schaufler --- security/smack/smack_access.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index efe2406a3960..7eabb448acab 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -688,9 +688,10 @@ bool smack_privileged_cred(int cap, const struct cred *cred) bool smack_privileged(int cap) { /* - * All kernel tasks are privileged + * Kernel threads may not have credentials we can use. + * The io_uring kernel threads do have reliable credentials. */ - if (unlikely(current->flags & PF_KTHREAD)) + if ((current->flags & (PF_KTHREAD | PF_IO_WORKER)) == PF_KTHREAD) return true; return smack_privileged_cred(cap, current_cred()); From 9faadcc8abe4b83d0263216dc3a6321d5bbd616b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Dec 2020 18:34:05 +0000 Subject: [PATCH 265/326] io_uring: fix double io_uring free Once we created a file for current context during setup, we should not call io_ring_ctx_wait_and_kill() directly as it'll be done by fput(file) Cc: stable@vger.kernel.org # 5.10 Reported-by: syzbot+c9937dfb2303a5f18640@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov [axboe: fix unused 'ret' for !CONFIG_UNIX] Signed-off-by: Jens Axboe --- fs/io_uring.c | 73 ++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 846c635d0620..7c0b77d49cd2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9379,55 +9379,52 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, return 0; } +static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +{ + int ret, fd; + + fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + + ret = io_uring_add_task_file(ctx, file); + if (ret) { + put_unused_fd(fd); + return ret; + } + fd_install(fd, file); + return fd; +} + /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, * we have to tie this fd to a socket for file garbage collection purposes. */ -static int io_uring_get_fd(struct io_ring_ctx *ctx) +static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { struct file *file; - int ret; - int fd; - #if defined(CONFIG_UNIX) + int ret; + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, &ctx->ring_sock); if (ret) - return ret; + return ERR_PTR(ret); #endif - ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (ret < 0) - goto err; - fd = ret; - file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC); +#if defined(CONFIG_UNIX) if (IS_ERR(file)) { - put_unused_fd(fd); - ret = PTR_ERR(file); - goto err; + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; + } else { + ctx->ring_sock->file = file; } - -#if defined(CONFIG_UNIX) - ctx->ring_sock->file = file; #endif - ret = io_uring_add_task_file(ctx, file); - if (ret) { - fput(file); - put_unused_fd(fd); - goto err; - } - fd_install(fd, file); - return fd; -err: -#if defined(CONFIG_UNIX) - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; -#endif - return ret; + return file; } static int io_uring_create(unsigned entries, struct io_uring_params *p, @@ -9435,6 +9432,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, { struct user_struct *user = NULL; struct io_ring_ctx *ctx; + struct file *file; bool limit_mem; int ret; @@ -9582,13 +9580,22 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + file = io_uring_get_file(ctx); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err; + } + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_get_fd(ctx); - if (ret < 0) - goto err; + ret = io_uring_install_fd(ctx, file); + if (ret < 0) { + /* fput will clean it up */ + fput(file); + return ret; + } trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; From c07e6719511e77c4b289f62bfe96423eb6ea061d Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Mon, 14 Dec 2020 23:49:41 +0800 Subject: [PATCH 266/326] io_uring: hold uring_lock while completing failed polled io in io_wq_submit_work() io_iopoll_complete() does not hold completion_lock to complete polled io, so in io_wq_submit_work(), we can not call io_req_complete() directly, to complete polled io, otherwise there maybe concurrent access to cqring, defer_list, etc, which is not safe. Commit dad1b1242fd5 ("io_uring: always let io_iopoll_complete() complete polled io") has fixed this issue, but Pavel reported that IOPOLL apart from rw can do buf reg/unreg requests( IORING_OP_PROVIDE_BUFFERS or IORING_OP_REMOVE_BUFFERS), so the fix is not good. Given that io_iopoll_complete() is always called under uring_lock, so here for polled io, we can also get uring_lock to fix this issue. Fixes: dad1b1242fd5 ("io_uring: always let io_iopoll_complete() complete polled io") Cc: # 5.5+ Signed-off-by: Xiaoguang Wang Reviewed-by: Pavel Begunkov [axboe: don't deref 'req' after completing it'] Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7c0b77d49cd2..7e35283fc0b1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6332,19 +6332,28 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) } if (ret) { - /* - * io_iopoll_complete() does not hold completion_lock to complete - * polled io, so here for polled io, just mark it done and still let - * io_iopoll_complete() complete it. - */ - if (req->ctx->flags & IORING_SETUP_IOPOLL) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_ring_ctx *lock_ctx = NULL; - kiocb_done(kiocb, ret, NULL); - } else { - req_set_fail_links(req); - io_req_complete(req, ret); - } + if (req->ctx->flags & IORING_SETUP_IOPOLL) + lock_ctx = req->ctx; + + /* + * io_iopoll_complete() does not hold completion_lock to + * complete polled io, so here for polled io, we can not call + * io_req_complete() directly, otherwise there maybe concurrent + * access to cqring, defer_list, etc, which is not safe. Given + * that io_iopoll_complete() is always called under uring_lock, + * so here for polled io, we also get uring_lock to complete + * it. + */ + if (lock_ctx) + mutex_lock(&lock_ctx->uring_lock); + + req_set_fail_links(req); + io_req_complete(req, ret); + + if (lock_ctx) + mutex_unlock(&lock_ctx->uring_lock); } return io_steal_work(req); From 9bfaf9c729a924c048eaf2032ce932b3c724dc27 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 21 Dec 2020 16:46:59 -0700 Subject: [PATCH 267/326] dt-bindings: Drop unnecessary *-supply schemas properties *-supply properties are always a single phandle, so binding schemas don't need a type $ref nor 'maxItems'. A meta-schema check for this is pending once these existing cases are fixed. Cc: Jonathan Cameron Cc: Dmitry Torokhov Cc: Mauro Carvalho Chehab Cc: Maxime Ripard Cc: dri-devel@lists.freedesktop.org Cc: linux-iio@vger.kernel.org Cc: linux-input@vger.kernel.org Cc: linux-media@vger.kernel.org Acked-by: Sam Ravnborg Reviewed-by: Laurent Pinchart Acked-by: Sakari Ailus Acked-by: Lee Jones Acked-by: Mark Brown Link: https://lore.kernel.org/r/20201221234659.824881-1-robh@kernel.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/display/bridge/anx6345.yaml | 2 -- .../devicetree/bindings/display/bridge/ite,it6505.yaml | 2 -- .../devicetree/bindings/display/bridge/lvds-codec.yaml | 3 +-- Documentation/devicetree/bindings/display/bridge/ps8640.yaml | 2 -- .../devicetree/bindings/display/bridge/simple-bridge.yaml | 1 - .../devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml | 1 - .../devicetree/bindings/display/bridge/toshiba,tc358775.yaml | 2 -- Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml | 3 +-- Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml | 3 +-- .../devicetree/bindings/input/fsl,mpr121-touchkey.yaml | 3 +-- .../devicetree/bindings/input/touchscreen/edt-ft5x06.yaml | 3 +-- Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml | 1 - Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml | 3 --- Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml | 3 --- Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml | 3 --- Documentation/devicetree/bindings/mfd/st,stmfx.yaml | 3 +-- .../devicetree/bindings/regulator/anatop-regulator.yaml | 1 - 17 files changed, 6 insertions(+), 33 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/anx6345.yaml b/Documentation/devicetree/bindings/display/bridge/anx6345.yaml index 8c0e4f285fbc..fccd63521a8c 100644 --- a/Documentation/devicetree/bindings/display/bridge/anx6345.yaml +++ b/Documentation/devicetree/bindings/display/bridge/anx6345.yaml @@ -26,11 +26,9 @@ properties: description: GPIO connected to active low reset dvdd12-supply: - maxItems: 1 description: Regulator for 1.2V digital core power. dvdd25-supply: - maxItems: 1 description: Regulator for 2.5V digital core power. ports: diff --git a/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml b/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml index efbb3d0117dc..02cfc0a3b550 100644 --- a/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ite,it6505.yaml @@ -35,11 +35,9 @@ properties: maxItems: 1 ovdd-supply: - maxItems: 1 description: I/O voltage pwr18-supply: - maxItems: 1 description: core voltage interrupts: diff --git a/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml b/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml index e5e3c72630cf..66a14d60ce1d 100644 --- a/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml +++ b/Documentation/devicetree/bindings/display/bridge/lvds-codec.yaml @@ -79,8 +79,7 @@ properties: The GPIO used to control the power down line of this device. maxItems: 1 - power-supply: - maxItems: 1 + power-supply: true required: - compatible diff --git a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml index 7e27cfcf770d..763c7909473e 100644 --- a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml @@ -35,11 +35,9 @@ properties: description: GPIO connected to active low reset. vdd12-supply: - maxItems: 1 description: Regulator for 1.2V digital core power. vdd33-supply: - maxItems: 1 description: Regulator for 3.3V digital core power. ports: diff --git a/Documentation/devicetree/bindings/display/bridge/simple-bridge.yaml b/Documentation/devicetree/bindings/display/bridge/simple-bridge.yaml index 3ddb35fcf0a2..64e8a1c24b40 100644 --- a/Documentation/devicetree/bindings/display/bridge/simple-bridge.yaml +++ b/Documentation/devicetree/bindings/display/bridge/simple-bridge.yaml @@ -60,7 +60,6 @@ properties: description: GPIO controlling bridge enable vdd-supply: - maxItems: 1 description: Power supply for the bridge required: diff --git a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml index 469ac4a34273..3d5ce08a5792 100644 --- a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml +++ b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.yaml @@ -74,7 +74,6 @@ properties: description: Power down GPIO signal, pin name "/PDWN", active low. vcc-supply: - maxItems: 1 description: Power supply for the TTL output, TTL CLOCKOUT signal, LVDS input, PLL and digital circuitry. diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml index fd3113aa9ccd..b5959cc78b8d 100644 --- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml +++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358775.yaml @@ -28,11 +28,9 @@ properties: description: i2c address of the bridge, 0x0f vdd-supply: - maxItems: 1 description: 1.2V LVDS Power Supply vddio-supply: - maxItems: 1 description: 1.8V IO Power Supply stby-gpios: diff --git a/Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml b/Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml index 6a991e9f78e2..2716d4e95329 100644 --- a/Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml +++ b/Documentation/devicetree/bindings/iio/adc/lltc,ltc2496.yaml @@ -17,8 +17,7 @@ properties: - lltc,ltc2496 vref-supply: - description: phandle to an external regulator providing the reference voltage - $ref: /schemas/types.yaml#/definitions/phandle + description: Power supply for the reference voltage reg: description: spi chipselect number according to the usual spi bindings diff --git a/Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml b/Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml index 7037f82ec753..88384b69f917 100644 --- a/Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml +++ b/Documentation/devicetree/bindings/iio/humidity/ti,hdc2010.yaml @@ -22,8 +22,7 @@ properties: - ti,hdc2010 - ti,hdc2080 - vdd-supply: - maxItems: 1 + vdd-supply: true reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml b/Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml index 378a85c09d34..878464f128dc 100644 --- a/Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml +++ b/Documentation/devicetree/bindings/input/fsl,mpr121-touchkey.yaml @@ -31,8 +31,7 @@ properties: interrupts: maxItems: 1 - vdd-supply: - maxItems: 1 + vdd-supply: true linux,keycodes: minItems: 1 diff --git a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml index 4ce109476a0e..bfc3a8b5e118 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml +++ b/Documentation/devicetree/bindings/input/touchscreen/edt-ft5x06.yaml @@ -55,8 +55,7 @@ properties: wakeup-source: true - vcc-supply: - maxItems: 1 + vcc-supply: true gain: description: Allows setting the sensitivity in the range from 0 to 31. diff --git a/Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml b/Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml index 9ea827092fdd..68ee8c7d9e79 100644 --- a/Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml +++ b/Documentation/devicetree/bindings/media/i2c/maxim,max9286.yaml @@ -40,7 +40,6 @@ properties: poc-supply: description: Regulator providing Power over Coax to the cameras - maxItems: 1 enable-gpios: description: GPIO connected to the \#PWDN pin with inverted polarity diff --git a/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml b/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml index 0df0334d2d0d..bb3528315f20 100644 --- a/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml +++ b/Documentation/devicetree/bindings/media/i2c/mipi-ccs.yaml @@ -39,15 +39,12 @@ properties: vana-supply: description: Analogue voltage supply (VANA), sensor dependent. - maxItems: 1 vcore-supply: description: Core voltage supply (VCore), sensor dependent. - maxItems: 1 vio-supply: description: I/O voltage supply (VIO), sensor dependent. - maxItems: 1 clocks: description: External clock to the sensor. diff --git a/Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml b/Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml index 1a3590dd0e98..eb12526a462f 100644 --- a/Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml +++ b/Documentation/devicetree/bindings/media/i2c/sony,imx214.yaml @@ -37,15 +37,12 @@ properties: vdddo-supply: description: Chip digital IO regulator (1.8V). - maxItems: 1 vdda-supply: description: Chip analog regulator (2.7V). - maxItems: 1 vddd-supply: description: Chip digital core regulator (1.12V). - maxItems: 1 flash-leds: description: See ../video-interfaces.txt diff --git a/Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml b/Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml index f697e1a20beb..a66acb20d59b 100644 --- a/Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml +++ b/Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml @@ -33,15 +33,12 @@ properties: vana-supply: description: Sensor 2.8 V analog supply. - maxItems: 1 vdig-supply: description: Sensor 1.8 V digital core supply. - maxItems: 1 vddl-supply: description: Sensor digital IO 1.2 V supply. - maxItems: 1 port: type: object diff --git a/Documentation/devicetree/bindings/mfd/st,stmfx.yaml b/Documentation/devicetree/bindings/mfd/st,stmfx.yaml index 888ab4b5df45..19e9afb385ac 100644 --- a/Documentation/devicetree/bindings/mfd/st,stmfx.yaml +++ b/Documentation/devicetree/bindings/mfd/st,stmfx.yaml @@ -26,8 +26,7 @@ properties: drive-open-drain: true - vdd-supply: - maxItems: 1 + vdd-supply: true pinctrl: type: object diff --git a/Documentation/devicetree/bindings/regulator/anatop-regulator.yaml b/Documentation/devicetree/bindings/regulator/anatop-regulator.yaml index e7b3abe30363..0a66338c7e5a 100644 --- a/Documentation/devicetree/bindings/regulator/anatop-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/anatop-regulator.yaml @@ -59,7 +59,6 @@ properties: description: u32 value representing regulator enable bit offset. vin-supply: - $ref: '/schemas/types.yaml#/definitions/phandle' description: input supply phandle. required: From 246eedd70da91d57bf485bd558c50f7b2286c462 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 21 Dec 2020 21:01:21 -0700 Subject: [PATCH 268/326] dt-bindings: net: qcom,ipa: Drop unnecessary type ref on 'memory-region' 'memory-region' is a common property, so it doesn't need a type ref here. Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Alex Elder Cc: netdev@vger.kernel.org Acked-by: Alex Elder Link: https://lore.kernel.org/r/20201222040121.1314370-1-robh@kernel.org Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/net/qcom,ipa.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/qcom,ipa.yaml b/Documentation/devicetree/bindings/net/qcom,ipa.yaml index d0cbbcf1b0e5..8a2d12644675 100644 --- a/Documentation/devicetree/bindings/net/qcom,ipa.yaml +++ b/Documentation/devicetree/bindings/net/qcom,ipa.yaml @@ -121,7 +121,6 @@ properties: receive and act on notifications of modem up/down events. memory-region: - $ref: /schemas/types.yaml#/definitions/phandle-array maxItems: 1 description: If present, a phandle for a reserved memory area that holds From 2b8f061a4f505aad11fd36adb24c3138ad09b96b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 21 Dec 2020 21:06:45 -0700 Subject: [PATCH 269/326] dt-bindings: Drop redundant maxItems/items 'maxItems' equal to the 'items' list length is redundant. 'maxItems' is preferred for a single entry while greater than 1 should have an 'items' list. A meta-schema check for this is pending once these existing cases are fixed. Cc: Greg Kroah-Hartman Cc: dri-devel@lists.freedesktop.org Cc: dmaengine@vger.kernel.org Cc: alsa-devel@alsa-project.org Cc: linux-usb@vger.kernel.org Acked-by: Sam Ravnborg Reviewed-by: Laurent Pinchart Acked-by: Vinod Koul Acked-by: Jassi Brar Acked-by: Mark Brown Link: https://lore.kernel.org/r/20201222040645.1323611-1-robh@kernel.org Signed-off-by: Rob Herring --- .../devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml | 1 - Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml | 1 - Documentation/devicetree/bindings/mailbox/arm,mhu.yaml | 1 - .../devicetree/bindings/sound/nvidia,tegra30-hda.yaml | 2 -- Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml | 1 - Documentation/devicetree/bindings/usb/renesas,usbhs.yaml | 3 --- 6 files changed, 9 deletions(-) diff --git a/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml b/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml index 7b9d468c3e52..403d57977ee7 100644 --- a/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml +++ b/Documentation/devicetree/bindings/display/xlnx/xlnx,zynqmp-dpsub.yaml @@ -98,7 +98,6 @@ properties: maxItems: 1 dmas: - maxItems: 4 items: - description: Video layer, plane 0 (RGB or luma) - description: Video layer, plane 1 (U/V or U) diff --git a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml index b548e4723936..c07eb6f2fc8d 100644 --- a/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml +++ b/Documentation/devicetree/bindings/dma/renesas,rcar-dmac.yaml @@ -73,7 +73,6 @@ properties: maxItems: 1 clock-names: - maxItems: 1 items: - const: fck diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml index d43791a2dde7..d07eb00b97c8 100644 --- a/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml +++ b/Documentation/devicetree/bindings/mailbox/arm,mhu.yaml @@ -61,7 +61,6 @@ properties: - description: low-priority non-secure - description: high-priority non-secure - description: Secure - maxItems: 3 clocks: maxItems: 1 diff --git a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml index e543a6123792..b55775e21de6 100644 --- a/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml +++ b/Documentation/devicetree/bindings/sound/nvidia,tegra30-hda.yaml @@ -44,7 +44,6 @@ properties: maxItems: 3 clock-names: - maxItems: 3 items: - const: hda - const: hda2hdmi @@ -54,7 +53,6 @@ properties: maxItems: 3 reset-names: - maxItems: 3 items: - const: hda - const: hda2hdmi diff --git a/Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml b/Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml index 0f078bd0a3e5..22603256ddf8 100644 --- a/Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml +++ b/Documentation/devicetree/bindings/usb/renesas,usb-xhci.yaml @@ -51,7 +51,6 @@ properties: maxItems: 1 phy-names: - maxItems: 1 items: - const: usb diff --git a/Documentation/devicetree/bindings/usb/renesas,usbhs.yaml b/Documentation/devicetree/bindings/usb/renesas,usbhs.yaml index 737c1f47b7de..54c361d4a7af 100644 --- a/Documentation/devicetree/bindings/usb/renesas,usbhs.yaml +++ b/Documentation/devicetree/bindings/usb/renesas,usbhs.yaml @@ -74,11 +74,8 @@ properties: phys: maxItems: 1 - items: - - description: phandle + phy specifier pair. phy-names: - maxItems: 1 items: - const: usb From 117ae250cfa3718f21bd07df0650dfbe3bc3a823 Mon Sep 17 00:00:00 2001 From: Yi Li Date: Wed, 23 Dec 2020 23:04:21 +0800 Subject: [PATCH 270/326] bcache:remove a superfluous check in register_bcache There have no reassign the bdev after check It is IS_ERR. the double check !IS_ERR(bdev) is superfluous. After commit 4e7b5671c6a8 ("block: remove i_bdev"), "Switch the block device lookup interfaces to directly work with a dev_t so that struct block_device references are only acquired by the blkdev_get variants (and the blk-cgroup special case). This means that we now don't need an extra reference in the inode and can generally simplify handling of struct block_device to keep the lookups contained in the core block layer code." so after lookup_bdev call, there no need to do bdput. remove a superfluous check the bdev & don't call bdput after lookup_bdev. Fixes: 4e7b5671c6a8("block: remove i_bdev") Signed-off-by: Yi Li Reviewed-by: Christoph Hellwig Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0e06d721cd8e..a4752ac410dc 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2535,8 +2535,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); - if (!IS_ERR(bdev)) - bdput(bdev); if (attr == &ksysfs_register_quiet) goto done; } From 46926127d76359b46659c556df7b4aa1b6325d90 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 23 Dec 2020 23:04:22 +0800 Subject: [PATCH 271/326] md/bcache: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 554e3afc9b68..00a520c03f41 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -404,7 +404,7 @@ STORE(__cached_dev) if (!env) return -ENOMEM; add_uevent_var(env, "DRIVER=bcache"); - add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid); add_uevent_var(env, "CACHED_LABEL=%s", buf); kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, From 35b14475257f553a7cd60ce4b2571304644f652b Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sat, 21 Nov 2020 21:01:47 +0800 Subject: [PATCH 272/326] drm/amdgpu: check number of gfx ring before init cp gfx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check number of gfx ring, rather than asic type, before cp gfx engine initialization so driver just need to make sure number of gfx ring is initialized correctly in gfx early_init phase. No need to add additional asic type check everywhere when there is new asic with gfx pipe removed. Signed-off-by: Hawking Zhang Reviewed-by: Feifei Xu Acked-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index fc9bb94eaaf4..ef430f285472 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1647,7 +1647,7 @@ static int gfx_v9_0_init_microcode(struct amdgpu_device *adev) } /* No CPG in Arcturus */ - if (adev->asic_type != CHIP_ARCTURUS) { + if (adev->gfx.num_gfx_rings) { r = gfx_v9_0_init_cp_gfx_microcode(adev, chip_name); if (r) return r; @@ -3822,7 +3822,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) gfx_v9_0_enable_gui_idle_interrupt(adev, false); if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) { - if (adev->asic_type != CHIP_ARCTURUS) { + if (adev->gfx.num_gfx_rings) { /* legacy firmware loading */ r = gfx_v9_0_cp_gfx_load_microcode(adev); if (r) @@ -3838,7 +3838,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) if (r) return r; - if (adev->asic_type != CHIP_ARCTURUS) { + if (adev->gfx.num_gfx_rings) { r = gfx_v9_0_cp_gfx_resume(adev); if (r) return r; @@ -3848,7 +3848,7 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) if (r) return r; - if (adev->asic_type != CHIP_ARCTURUS) { + if (adev->gfx.num_gfx_rings) { ring = &adev->gfx.gfx_ring[0]; r = amdgpu_ring_test_helper(ring); if (r) @@ -3884,7 +3884,7 @@ static void gfx_v9_0_init_tcp_config(struct amdgpu_device *adev) static void gfx_v9_0_cp_enable(struct amdgpu_device *adev, bool enable) { - if (adev->asic_type != CHIP_ARCTURUS) + if (adev->gfx.num_gfx_rings) gfx_v9_0_cp_gfx_enable(adev, enable); gfx_v9_0_cp_compute_enable(adev, enable); } @@ -4025,7 +4025,7 @@ static int gfx_v9_0_soft_reset(void *handle) /* stop the rlc */ adev->gfx.rlc.funcs->stop(adev); - if (adev->asic_type != CHIP_ARCTURUS) + if (adev->gfx.num_gfx_rings) /* Disable GFX parsing/prefetching */ gfx_v9_0_cp_gfx_enable(adev, false); From d0f2f634f59d8f35e70644daf956bf04d2ff2d0c Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sat, 21 Nov 2020 21:07:12 +0800 Subject: [PATCH 273/326] drm/amdgpu: remove unnecessary asic type check The number of crtc should be 0 for ASICs that don't have display engine. Remove the unnecessary asic type check then. Signed-off-by: Hawking Zhang Reviewed-by: Feifei Xu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index e1531d97f486..e22268f9dba7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1577,13 +1577,10 @@ static int gmc_v9_0_hw_init(void *handle) gmc_v9_0_init_golden_registers(adev); if (adev->mode_info.num_crtc) { - if (adev->asic_type != CHIP_ARCTURUS) { - /* Lockout access through VGA aperture*/ - WREG32_FIELD15(DCE, 0, VGA_HDP_CONTROL, VGA_MEMORY_DISABLE, 1); - - /* disable VGA render */ - WREG32_FIELD15(DCE, 0, VGA_RENDER_CONTROL, VGA_VSTATUS_CNTL, 0); - } + /* Lockout access through VGA aperture*/ + WREG32_FIELD15(DCE, 0, VGA_HDP_CONTROL, VGA_MEMORY_DISABLE, 1); + /* disable VGA render */ + WREG32_FIELD15(DCE, 0, VGA_RENDER_CONTROL, VGA_VSTATUS_CNTL, 0); } amdgpu_device_program_register_sequence(adev, From 462fbeb1fcfcd35e453eeaa80d6d3d26464269fd Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sat, 21 Nov 2020 21:58:19 +0800 Subject: [PATCH 274/326] drm/amdgpu: check gfx pipe availability before toggling its interrupts GUI_IDLE interrupts controlled by CP_INT_CNTL_RING0 are only applicable to me0 pipe0. For ASICs that have gfx pipe removed, don't toggle those bits. Signed-off-by: Hawking Zhang Reviewed-by: Feifei Xu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index ef430f285472..5f4805e4d04a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2633,7 +2633,14 @@ static void gfx_v9_0_wait_for_rlc_serdes(struct amdgpu_device *adev) static void gfx_v9_0_enable_gui_idle_interrupt(struct amdgpu_device *adev, bool enable) { - u32 tmp = RREG32_SOC15(GC, 0, mmCP_INT_CNTL_RING0); + u32 tmp; + + /* don't toggle interrupts that are only applicable + * to me0 pipe0 on AISCs that have me0 removed */ + if (!adev->gfx.num_gfx_rings) + return; + + tmp= RREG32_SOC15(GC, 0, mmCP_INT_CNTL_RING0); tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_BUSY_INT_ENABLE, enable ? 1 : 0); tmp = REG_SET_FIELD(tmp, CP_INT_CNTL_RING0, CNTX_EMPTY_INT_ENABLE, enable ? 1 : 0); From ea96b12aa4fa116aa8ff4cf8de839ea65a2bb3ef Mon Sep 17 00:00:00 2001 From: Qingqing Zhuo Date: Fri, 4 Dec 2020 10:55:13 -0500 Subject: [PATCH 275/326] drm/amd/display: handler not correctly checked at remove_irq_handler [why] handler is supposedly passed in as a function pointer; however, the entire struct amdgpu_dm_irq_handler_data gets from the list is used to check match. [how] use the interrupt_handler within amdgpu_dm_irq_handler_data for checking match. Signed-off-by: Qingqing Zhuo Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c index 357778556b06..26ed70e5538a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c @@ -165,7 +165,10 @@ static struct list_head *remove_irq_handler(struct amdgpu_device *adev, handler = list_entry(entry, struct amdgpu_dm_irq_handler_data, list); - if (ih == handler) { + if (handler == NULL) + continue; + + if (ih == handler->handler) { /* Found our handler. Remove it from the list. */ list_del(&handler->list); handler_removed = true; From 2da94e2808bd7df30ace134991ed0fbd95188acd Mon Sep 17 00:00:00 2001 From: Wesley Chalmers Date: Mon, 7 Dec 2020 11:46:08 -0500 Subject: [PATCH 276/326] drm/amd/display: Interfaces for hubp blank and soft reset [WHY] HUBP blanking sequence on DCN30 requires us to check if HUBP is in blank and also toggle HUBP_DISABLE, which should instead be called HUBP_SOFT_RESET for what it does in HW. Signed-off-by: Wesley Chalmers Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c | 18 ++++++++++++++++++ .../gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h | 4 ++++ .../gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c | 2 ++ .../gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c | 2 ++ drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h | 2 ++ 5 files changed, 28 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c index 41679ad531c5..9e796dfeac20 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.c @@ -1241,6 +1241,22 @@ void hubp1_vtg_sel(struct hubp *hubp, uint32_t otg_inst) REG_UPDATE(DCHUBP_CNTL, HUBP_VTG_SEL, otg_inst); } +bool hubp1_in_blank(struct hubp *hubp) +{ + uint32_t in_blank; + struct dcn10_hubp *hubp1 = TO_DCN10_HUBP(hubp); + + REG_GET(DCHUBP_CNTL, HUBP_IN_BLANK, &in_blank); + return in_blank ? true : false; +} + +void hubp1_soft_reset(struct hubp *hubp, bool reset) +{ + struct dcn10_hubp *hubp1 = TO_DCN10_HUBP(hubp); + + REG_UPDATE(DCHUBP_CNTL, HUBP_DISABLE, reset ? 1 : 0); +} + void hubp1_init(struct hubp *hubp) { //do nothing @@ -1272,6 +1288,8 @@ static const struct hubp_funcs dcn10_hubp_funcs = { .dmdata_set_attributes = NULL, .dmdata_load = NULL, + .hubp_soft_reset = hubp1_soft_reset, + .hubp_in_blank = hubp1_in_blank, }; /*****************************************/ diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h index 780af5b3c16f..a9a6ed7f4f99 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubp.h @@ -260,6 +260,7 @@ HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_NO_OUTSTANDING_REQ, mask_sh),\ HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_VTG_SEL, mask_sh),\ HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_DISABLE, mask_sh),\ + HUBP_SF(HUBP0_DCHUBP_CNTL, HUBP_IN_BLANK, mask_sh),\ HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, NUM_PIPES, mask_sh),\ HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, NUM_BANKS, mask_sh),\ HUBP_SF(HUBP0_DCSURF_ADDR_CONFIG, PIPE_INTERLEAVE, mask_sh),\ @@ -455,6 +456,7 @@ type HUBP_VTG_SEL;\ type HUBP_UNDERFLOW_STATUS;\ type HUBP_UNDERFLOW_CLEAR;\ + type HUBP_IN_BLANK;\ type NUM_PIPES;\ type NUM_BANKS;\ type PIPE_INTERLEAVE;\ @@ -772,5 +774,7 @@ void hubp1_vready_workaround(struct hubp *hubp, void hubp1_init(struct hubp *hubp); void hubp1_read_state_common(struct hubp *hubp); +bool hubp1_in_blank(struct hubp *hubp); +void hubp1_soft_reset(struct hubp *hubp, bool reset); #endif diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c index b7e44e53a342..0df0da2e6a4d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c @@ -1595,6 +1595,8 @@ static struct hubp_funcs dcn20_hubp_funcs = { .hubp_set_flip_control_surface_gsl = hubp2_set_flip_control_surface_gsl, .hubp_init = hubp1_init, .validate_dml_output = hubp2_validate_dml_output, + .hubp_in_blank = hubp1_in_blank, + .hubp_soft_reset = hubp1_soft_reset, }; diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c index af462fe4260d..88ffa9ff1ed1 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hubp.c @@ -509,6 +509,8 @@ static struct hubp_funcs dcn30_hubp_funcs = { .hubp_clear_underflow = hubp2_clear_underflow, .hubp_set_flip_control_surface_gsl = hubp2_set_flip_control_surface_gsl, .hubp_init = hubp3_init, + .hubp_in_blank = hubp1_in_blank, + .hubp_soft_reset = hubp1_soft_reset, }; bool hubp3_construct( diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h index 315e3061c592..22f3f643ed1b 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h @@ -188,6 +188,8 @@ struct hubp_funcs { void (*set_unbounded_requesting)( struct hubp *hubp, bool enable); + bool (*hubp_in_blank)(struct hubp *hubp); + void (*hubp_soft_reset)(struct hubp *hubp, bool reset); }; From c2d61e309171437e042f4c859e88077fffee18e5 Mon Sep 17 00:00:00 2001 From: Martin Tsai Date: Thu, 3 Dec 2020 10:47:11 +0800 Subject: [PATCH 277/326] drm/amd/display: Modify the hdcp device count check condition [why] Some MST display may not report the internal panel to DEVICE_COUNT, that makes the check condition always failed. [how] To update this condition with the reported device count + 1 (because the immediate repeater's internal panel is possibly not included in DEVICE_COUNT) Signed-off-by: Martin Tsai Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c | 8 ++++++-- .../gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 7 +++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c index f244b72e74e0..73ca49f05bd3 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c @@ -128,8 +128,12 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp) static inline enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp) { - /* device count must be greater than or equal to tracked hdcp displays */ - return (get_device_count(hdcp) < get_active_display_count(hdcp)) ? + /* Some MST display may choose to report the internal panel as an HDCP RX. + * To update this condition with 1(because the immediate repeater's internal + * panel is possibly not included in DEVICE_COUNT) + get_device_count(hdcp). + * Device count must be greater than or equal to tracked hdcp displays. + */ + return ((1 + get_device_count(hdcp)) < get_active_display_count(hdcp)) ? MOD_HDCP_STATUS_HDCP1_DEVICE_COUNT_MISMATCH_FAILURE : MOD_HDCP_STATUS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index 549c113abcf7..a0895a7efda2 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -207,8 +207,11 @@ static inline uint8_t get_device_count(struct mod_hdcp *hdcp) static enum mod_hdcp_status check_device_count(struct mod_hdcp *hdcp) { - /* device count must be greater than or equal to tracked hdcp displays */ - return (get_device_count(hdcp) < get_active_display_count(hdcp)) ? + /* Some MST display may choose to report the internal panel as an HDCP RX. */ + /* To update this condition with 1(because the immediate repeater's internal */ + /* panel is possibly not included in DEVICE_COUNT) + get_device_count(hdcp). */ + /* Device count must be greater than or equal to tracked hdcp displays. */ + return ((1 + get_device_count(hdcp)) < get_active_display_count(hdcp)) ? MOD_HDCP_STATUS_HDCP2_DEVICE_COUNT_MISMATCH_FAILURE : MOD_HDCP_STATUS_SUCCESS; } From 9413b23fadad3861f5afd626ac44ef83ad8068ab Mon Sep 17 00:00:00 2001 From: Martin Tsai Date: Wed, 2 Dec 2020 20:22:13 +0800 Subject: [PATCH 278/326] drm/amd/display: To modify the condition in indicating branch device [why] The sink count change HPD_IRQ will be ignored if the branch device has only DP DFP. [how] To remove the port type restriction. Signed-off-by: Martin Tsai Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 6b11d4af54af..2fc12239b22c 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -3173,13 +3173,7 @@ static void get_active_converter_info( } /* DPCD 0x5 bit 0 = 1, it indicate it's branch device */ - if (ds_port.fields.PORT_TYPE == DOWNSTREAM_DP) { - link->dpcd_caps.is_branch_dev = false; - } - - else { - link->dpcd_caps.is_branch_dev = ds_port.fields.PORT_PRESENT; - } + link->dpcd_caps.is_branch_dev = ds_port.fields.PORT_PRESENT; switch (ds_port.fields.PORT_TYPE) { case DOWNSTREAM_VGA: From e8e91f9395ef13cf054860f8ccd757333d9b6d0d Mon Sep 17 00:00:00 2001 From: Rizvi Date: Wed, 2 Dec 2020 14:52:22 -0700 Subject: [PATCH 279/326] drm/amd/display: gradually ramp ABM intensity [Why] Need driver to pass values of backlight ramp start and ramp reduction so that intensity can be ramped down appropriately. [How] Using abm_parameters structure to get these values from driver. Signed-off-by: Rizvi Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../amd/display/modules/power/power_helpers.c | 35 +++++++++++++------ .../amd/display/modules/power/power_helpers.h | 1 + 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c index cc983f662157..4fd8bce95d84 100644 --- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c +++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c @@ -82,22 +82,24 @@ struct abm_parameters { unsigned char deviation_gain; unsigned char min_knee; unsigned char max_knee; + unsigned short blRampReduction; + unsigned short blRampStart; }; static const struct abm_parameters abm_settings_config0[abm_defines_max_level] = { -// min_red max_red bright_pos dark_pos brightness_gain contrast deviation min_knee max_knee - {0xff, 0xbf, 0x20, 0x00, 0xff, 0x99, 0xb3, 0x40, 0xe0}, - {0xde, 0x85, 0x20, 0x00, 0xff, 0x90, 0xa8, 0x40, 0xdf}, - {0xb0, 0x50, 0x20, 0x00, 0xc0, 0x88, 0x78, 0x70, 0xa0}, - {0x82, 0x40, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70}, +// min_red max_red bright_pos dark_pos bright_gain contrast dev min_knee max_knee blStart blRed + {0xff, 0xbf, 0x20, 0x00, 0xff, 0x99, 0xb3, 0x40, 0xe0, 0xCCCC, 0xCCCC}, + {0xde, 0x85, 0x20, 0x00, 0xff, 0x90, 0xa8, 0x40, 0xdf, 0xCCCC, 0xCCCC}, + {0xb0, 0x50, 0x20, 0x00, 0xc0, 0x88, 0x78, 0x70, 0xa0, 0xCCCC, 0xCCCC}, + {0x82, 0x40, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70, 0xCCCC, 0xCCCC}, }; static const struct abm_parameters abm_settings_config1[abm_defines_max_level] = { -// min_red max_red bright_pos dark_pos brightness_gain contrast deviation min_knee max_knee - {0xf0, 0xd9, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70}, - {0xcd, 0xa5, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70}, - {0x99, 0x65, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70}, - {0x82, 0x4d, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70}, +// min_red max_red bright_pos dark_pos bright_gain contrast dev min_knee max_knee blStart blRed + {0xf0, 0xd9, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70, 0xCCCC, 0xCCCC}, + {0xcd, 0xa5, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70, 0xCCCC, 0xCCCC}, + {0x99, 0x65, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70, 0xCCCC, 0xCCCC}, + {0x82, 0x4d, 0x20, 0x00, 0x00, 0xff, 0xb3, 0x70, 0x70, 0xCCCC, 0xCCCC}, }; static const struct abm_parameters * const abm_settings[] = { @@ -662,6 +664,7 @@ bool dmub_init_abm_config(struct resource_pool *res_pool, { struct iram_table_v_2_2 ram_table; struct abm_config_table config; + unsigned int set = params.set; bool result = false; uint32_t i, j = 0; @@ -710,6 +713,18 @@ bool dmub_init_abm_config(struct resource_pool *res_pool, config.max_knee[i] = ram_table.max_knee[i]; } + if (params.backlight_ramping_override) { + for (i = 0; i < NUM_AGGR_LEVEL; i++) { + config.blRampReduction[i] = params.backlight_ramping_reduction; + config.blRampStart[i] = params.backlight_ramping_start; + } + } else { + for (i = 0; i < NUM_AGGR_LEVEL; i++) { + config.blRampReduction[i] = abm_settings[set][i].blRampReduction; + config.blRampStart[i] = abm_settings[set][i].blRampStart; + } + } + config.min_abm_backlight = ram_table.min_abm_backlight; #if defined(CONFIG_DRM_AMD_DC_DCN) diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.h b/drivers/gpu/drm/amd/display/modules/power/power_helpers.h index fa4728d88092..6f2eecce6baa 100644 --- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.h +++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.h @@ -39,6 +39,7 @@ enum abm_defines { struct dmcu_iram_parameters { unsigned int *backlight_lut_array; unsigned int backlight_lut_array_size; + bool backlight_ramping_override; unsigned int backlight_ramping_reduction; unsigned int backlight_ramping_start; unsigned int min_abm_backlight; From cf7fc75523b32a9a119a466dcff325f1fda38c7d Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Wed, 9 Dec 2020 16:56:51 -0500 Subject: [PATCH 280/326] drm/amd/display: change SMU repsonse timeout to 2s. [Why] there is some garbage showing up during reboot test. Reason: SMU might handle display driver msg defered and driver will send next msg to SMU after 10ms timeout, once SMU FW handle previous msg, parameters are changed to next one, which result in a wrong value be programmed. [How] Extend timeout to 2s so SMU will have enough time to handle driver msg. Signed-off-by: Yongqiang Sun Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c index 11a7b583d561..7deeec9d1c7c 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c @@ -99,7 +99,7 @@ int rn_vbios_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr, unsigned /* Trigger the message transaction by writing the message ID */ REG_WRITE(MP1_SMN_C2PMSG_67, msg_id); - result = rn_smu_wait_for_response(clk_mgr, 10, 1000); + result = rn_smu_wait_for_response(clk_mgr, 10, 200000); ASSERT(result == VBIOSSMC_Result_OK || result == VBIOSSMC_Result_UnknownCmd); From e82632356d531dbc575377d594e85e65aa1293f9 Mon Sep 17 00:00:00 2001 From: Michael Strauss Date: Mon, 30 Nov 2020 12:14:00 -0500 Subject: [PATCH 281/326] drm/amd/display: Update RN/VGH active display count workaround [WHY] Virtual signals were previously counted as a workaround to S0i2 hang which is fixed on Renoir. This blocks S0i3 diags testing. [HOW] Stop counting virtual signals as S0i2 hang is fixed on Renoir. Signed-off-by: Michael Strauss Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 9 +-------- .../gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c | 9 +-------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index d00b02553d62..9aa1b63bb161 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -75,15 +75,8 @@ int rn_get_active_display_cnt_wa( for (i = 0; i < dc->link_count; i++) { const struct dc_link *link = dc->links[i]; - /* - * Only notify active stream or virtual stream. - * Need to notify virtual stream to work around - * headless case. HPD does not fire when system is in - * S0i2. - */ /* abusing the fact that the dig and phy are coupled to see if the phy is enabled */ - if (link->connector_signal == SIGNAL_TYPE_VIRTUAL || - link->link_enc->funcs->is_dig_enabled(link->link_enc)) + if (link->link_enc->funcs->is_dig_enabled(link->link_enc)) display_count++; } diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c index 9a8e66bba9c0..991b9c5beaa3 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn301/vg_clk_mgr.c @@ -74,15 +74,8 @@ int vg_get_active_display_cnt_wa( for (i = 0; i < dc->link_count; i++) { const struct dc_link *link = dc->links[i]; - /* - * Only notify active stream or virtual stream. - * Need to notify virtual stream to work around - * headless case. HPD does not fire when system is in - * S0i2. - */ /* abusing the fact that the dig and phy are coupled to see if the phy is enabled */ - if (link->connector_signal == SIGNAL_TYPE_VIRTUAL || - link->link_enc->funcs->is_dig_enabled(link->link_enc)) + if (link->link_enc->funcs->is_dig_enabled(link->link_enc)) display_count++; } From cbac53f7fc90754b898e79ab2d5c11052ce1b640 Mon Sep 17 00:00:00 2001 From: Eryk Brol Date: Tue, 8 Dec 2020 12:52:36 -0500 Subject: [PATCH 282/326] drm/amd/display: Remove unnecessary NULL check [Why] new_crtc_state is already dereferenced earlier in the function [How] Remove the check Signed-off-by: Eryk Brol Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2c4dbdeec46a..8fac6116591b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9367,7 +9367,7 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, if (ret) goto fail; - if (dm_old_crtc_state->dsc_force_changed && new_crtc_state) + if (dm_old_crtc_state->dsc_force_changed) new_crtc_state->mode_changed = true; } From a71e5529d2674584fda0fa09a7de4efc8e17160d Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Thu, 10 Dec 2020 12:11:32 -0500 Subject: [PATCH 283/326] drm/amd/display: Multi-display underflow observed [Why] FP2 programming not happening when topology changes occur with multiple displays. [How] Ensure FP2 is programmed whenever global sync changes occur but wait for VACTIVE first to avoid underflow. Signed-off-by: Aric Cyr Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 20 ------------------- .../drm/amd/display/dc/dcn20/dcn20_hwseq.c | 12 ++++++++--- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 7339d9855ec8..58eb0d69873a 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -2625,26 +2625,6 @@ static void commit_planes_for_stream(struct dc *dc, } } - if (update_type != UPDATE_TYPE_FAST) { - // If changing VTG FP2: wait until back in vactive to program FP2 - // Need to ensure that pipe unlock happens soon after to minimize race condition - for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; - - if (pipe_ctx->top_pipe || pipe_ctx->stream != stream) - continue; - - if (!pipe_ctx->update_flags.bits.global_sync) - continue; - - pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK); - pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE); - - pipe_ctx->stream_res.tg->funcs->set_vtg_params( - pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, true); - } - } - if ((update_type != UPDATE_TYPE_FAST) && dc->hwss.interdependent_update_lock) dc->hwss.interdependent_update_lock(dc, context, false); else diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c index 31a477194d3b..cb822df21b7c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c @@ -1586,7 +1586,10 @@ static void dcn20_program_pipe( && !pipe_ctx->top_pipe && !pipe_ctx->prev_odm_pipe) hws->funcs.blank_pixel_data(dc, pipe_ctx, !pipe_ctx->plane_state->visible); - if (pipe_ctx->update_flags.bits.global_sync) { + /* Only update TG on top pipe */ + if (pipe_ctx->update_flags.bits.global_sync && !pipe_ctx->top_pipe + && !pipe_ctx->prev_odm_pipe) { + pipe_ctx->stream_res.tg->funcs->program_global_sync( pipe_ctx->stream_res.tg, pipe_ctx->pipe_dlg_param.vready_offset, @@ -1594,8 +1597,11 @@ static void dcn20_program_pipe( pipe_ctx->pipe_dlg_param.vupdate_offset, pipe_ctx->pipe_dlg_param.vupdate_width); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE); + pipe_ctx->stream_res.tg->funcs->set_vtg_params( - pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, false); + pipe_ctx->stream_res.tg, &pipe_ctx->stream->timing, true); if (hws->funcs.setup_vupdate_interrupt) hws->funcs.setup_vupdate_interrupt(dc, pipe_ctx); @@ -2570,4 +2576,4 @@ void dcn20_set_disp_pattern_generator(const struct dc *dc, { pipe_ctx->stream_res.opp->funcs->opp_set_disp_pattern_generator(pipe_ctx->stream_res.opp, test_pattern, color_space, color_depth, solid_color, width, height, offset); -} \ No newline at end of file +} From 73d48f0851847268482260eb955ed8d928b7f19c Mon Sep 17 00:00:00 2001 From: Sung Lee Date: Wed, 9 Dec 2020 14:58:59 -0500 Subject: [PATCH 284/326] drm/amd/display: Acquire DSC during split stream for ODM only if top_pipe [WHY] DSC should only be acquired per OPP. Therefore, DSC should only be acquired for the top_pipe when ODM is enabled. Not doing this check may lead to acquiring more DSC's than needed when doing MPO + ODM Combine. [HOW] Only acquire DSC if pipe is top_pipe. Signed-off-by: Sung Lee Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c index ff36db5edf6c..e04ecf0fc0db 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c @@ -1933,7 +1933,7 @@ bool dcn20_split_stream_for_odm( next_odm_pipe->stream_res.opp = pool->opps[next_odm_pipe->pipe_idx]; else next_odm_pipe->stream_res.opp = next_odm_pipe->top_pipe->stream_res.opp; - if (next_odm_pipe->stream->timing.flags.DSC == 1) { + if (next_odm_pipe->stream->timing.flags.DSC == 1 && !next_odm_pipe->top_pipe) { dcn20_acquire_dsc(dc, res_ctx, &next_odm_pipe->stream_res.dsc, next_odm_pipe->pipe_idx); ASSERT(next_odm_pipe->stream_res.dsc); if (next_odm_pipe->stream_res.dsc == NULL) From 1e7445dcc17444569d9f0acce227aadf095ac989 Mon Sep 17 00:00:00 2001 From: Jake Wang Date: Wed, 9 Dec 2020 18:00:18 -0500 Subject: [PATCH 285/326] drm/amd/display: updated wm table for Renoir [Why] For certain timings, Renoir may underflow due to sr exit latency being too slow. [How] Updated wm table for renoir. Signed-off-by: Jake Wang Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index 9aa1b63bb161..bad30217c7b4 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -731,32 +731,32 @@ static struct wm_table ddr4_wm_table_rn = { .wm_inst = WM_A, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 9.09, - .sr_enter_plus_exit_time_us = 10.14, + .sr_exit_time_us = 11.90, + .sr_enter_plus_exit_time_us = 12.80, .valid = true, }, { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 11.12, - .sr_enter_plus_exit_time_us = 12.48, + .sr_exit_time_us = 13.18, + .sr_enter_plus_exit_time_us = 14.30, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 11.12, - .sr_enter_plus_exit_time_us = 12.48, + .sr_exit_time_us = 13.18, + .sr_enter_plus_exit_time_us = 14.30, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 11.12, - .sr_enter_plus_exit_time_us = 12.48, + .sr_exit_time_us = 13.18, + .sr_enter_plus_exit_time_us = 14.30, .valid = true, }, } From c277925cca8c534ddcf1fb0ec9b9e4ca35b1d064 Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Fri, 11 Dec 2020 15:34:30 -0500 Subject: [PATCH 286/326] drm/amd/display: [FW Promotion] Release 0.0.47 - restore lvtma_pwrseq_delay2 from vbios integrated info table - restore MVID/NVID after power up. - Enable timer wake up mask when enable timer interrupt. Signed-off-by: Yongqiang Sun Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index f512bda96917..249a076d6f69 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -47,10 +47,10 @@ /* Firmware versioning. */ #ifdef DMUB_EXPOSE_VERSION -#define DMUB_FW_VERSION_GIT_HASH 0xa18e25995 +#define DMUB_FW_VERSION_GIT_HASH 0xf51b86a #define DMUB_FW_VERSION_MAJOR 0 #define DMUB_FW_VERSION_MINOR 0 -#define DMUB_FW_VERSION_REVISION 46 +#define DMUB_FW_VERSION_REVISION 47 #define DMUB_FW_VERSION_TEST 0 #define DMUB_FW_VERSION_VBIOS 0 #define DMUB_FW_VERSION_HOTFIX 0 From 4aa9d658d21cf192fa12227591526d06fec114e0 Mon Sep 17 00:00:00 2001 From: Jake Wang Date: Fri, 11 Dec 2020 16:53:57 -0500 Subject: [PATCH 287/326] drm/amd/display: always program DPPDTO unless not safe to lower [Why] We defer clock updates to after pipes have been programmed. In some instances we use DPPCLK that have been previously set to be "unused". This results in a brief window of time where underflow could occur. [How] During prepare bandwidth allow rn_update_clocks_update_dpp_dto to check each instance and compare previous clock to new clock. If new clock is higher than previous clock, program DPPDTO. Signed-off-by: Jake Wang Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index bad30217c7b4..01b1853b7750 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -227,12 +227,11 @@ void rn_update_clocks(struct clk_mgr *clk_mgr_base, rn_vbios_smu_set_dppclk(clk_mgr, clk_mgr_base->clks.dppclk_khz); // always update dtos unless clock is lowered and not safe to lower - if (new_clocks->dppclk_khz >= dc->current_state->bw_ctx.bw.dcn.clk.dppclk_khz) - rn_update_clocks_update_dpp_dto( - clk_mgr, - context, - clk_mgr_base->clks.actual_dppclk_khz, - safe_to_lower); + rn_update_clocks_update_dpp_dto( + clk_mgr, + context, + clk_mgr_base->clks.actual_dppclk_khz, + safe_to_lower); } if (update_dispclk && From 110b055b282736e277298141c42227595408f606 Mon Sep 17 00:00:00 2001 From: Josip Pavic Date: Fri, 11 Dec 2020 00:09:11 -0500 Subject: [PATCH 288/326] drm/amd/display: add getter routine to retrieve mpcc mux [Why & How] Add function to identify which MPCC is providing input to a specified OPP Signed-off-by: Josip Pavic Acked-by: Bindu Ramamurthy Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c | 12 ++++++++++++ drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h | 1 + drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c | 1 + drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c | 1 + drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h | 4 ++++ 5 files changed, 19 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c index 3fcd408e9103..a46cb20596fe 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c @@ -467,6 +467,17 @@ void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock) REG_SET(CUR[opp_id], 0, CUR_VUPDATE_LOCK_SET, lock ? 1 : 0); } +unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id) +{ + struct dcn10_mpc *mpc10 = TO_DCN10_MPC(mpc); + uint32_t val; + + if (opp_id < MAX_OPP && REG(MUX[opp_id])) + REG_GET(MUX[opp_id], MPC_OUT_MUX, &val); + + return val; +} + static const struct mpc_funcs dcn10_mpc_funcs = { .read_mpcc_state = mpc1_read_mpcc_state, .insert_plane = mpc1_insert_plane, @@ -483,6 +494,7 @@ static const struct mpc_funcs dcn10_mpc_funcs = { .set_denorm_clamp = NULL, .set_output_csc = NULL, .set_output_gamma = NULL, + .get_mpc_out_mux = mpc1_get_mpc_out_mux, }; void dcn10_mpc_construct(struct dcn10_mpc *mpc10, diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h index 66a4719c22a0..dbfffc6383dc 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.h @@ -200,4 +200,5 @@ void mpc1_read_mpcc_state( void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock); +unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id); #endif diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c index 99cc095dc33c..6a99fdd55e8c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_mpc.c @@ -556,6 +556,7 @@ const struct mpc_funcs dcn20_mpc_funcs = { .set_ocsc_default = mpc2_set_ocsc_default, .set_output_gamma = mpc2_set_output_gamma, .power_on_mpc_mem_pwr = mpc20_power_on_ogam_lut, + .get_mpc_out_mux = mpc1_get_mpc_out_mux, }; void dcn20_mpc_construct(struct dcn20_mpc *mpc20, diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c index d7d053fc6e91..3e6f76096119 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_mpc.c @@ -1428,6 +1428,7 @@ const struct mpc_funcs dcn30_mpc_funcs = { .program_3dlut = mpc3_program_3dlut, .release_rmu = mpcc3_release_rmu, .power_on_mpc_mem_pwr = mpc20_power_on_ogam_lut, + .get_mpc_out_mux = mpc1_get_mpc_out_mux, }; diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h index 879f502ae530..75c77ad9cbfe 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h @@ -359,6 +359,10 @@ struct mpc_funcs { int (*release_rmu)(struct mpc *mpc, int mpcc_id); + unsigned int (*get_mpc_out_mux)( + struct mpc *mpc, + int opp_id); + }; #endif From e75a9db3c59e923f54a36870a7cc339afe9e611b Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 18 Dec 2020 12:38:50 +0800 Subject: [PATCH 289/326] drm/amd/pm: bump Sienna Cichlid smu_driver_if version to match latest pmfw This can suppress the annoying but unharmful prompts. Signed-off-by: Evan Quan Reviewed-by: Guchun Chen Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/inc/smu_v11_0.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h index e5aa0725147c..13de692a4213 100644 --- a/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h +++ b/drivers/gpu/drm/amd/pm/inc/smu_v11_0.h @@ -30,7 +30,7 @@ #define SMU11_DRIVER_IF_VERSION_NV10 0x36 #define SMU11_DRIVER_IF_VERSION_NV12 0x36 #define SMU11_DRIVER_IF_VERSION_NV14 0x36 -#define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x3B +#define SMU11_DRIVER_IF_VERSION_Sienna_Cichlid 0x3D #define SMU11_DRIVER_IF_VERSION_Navy_Flounder 0xC #define SMU11_DRIVER_IF_VERSION_VANGOGH 0x02 #define SMU11_DRIVER_IF_VERSION_Dimgrey_Cavefish 0xF From 05211e7fbbf042dd7f51155ebe64eb2ecacb25cb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 17 Dec 2020 12:11:36 -0500 Subject: [PATCH 290/326] drm/amdgpu: only set DP subconnector type on DP and eDP connectors Fixes a crash in drm_object_property_set_value() because the property is not set for internal DP ports that connect to a bridge chips (e.g., DP to VGA or DP to LVDS). Bug: https://bugzilla.kernel.org/show_bug.cgi?id=210739 Fixes: 65bf2cf95d3ade ("drm/amdgpu: utilize subconnector property for DP through atombios") Tested-By: Kris Karas Cc: Oleg Vasilev Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.10.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index 65d1b23d7e74..b9c11c2b2885 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -1414,10 +1414,12 @@ out: pm_runtime_put_autosuspend(connector->dev->dev); } - drm_dp_set_subconnector_property(&amdgpu_connector->base, - ret, - amdgpu_dig_connector->dpcd, - amdgpu_dig_connector->downstream_ports); + if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort || + connector->connector_type == DRM_MODE_CONNECTOR_eDP) + drm_dp_set_subconnector_property(&amdgpu_connector->base, + ret, + amdgpu_dig_connector->dpcd, + amdgpu_dig_connector->downstream_ports); return ret; } From 505199a3b714aeb9d13dd0a04c33db9f5d99482a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 18 Dec 2020 11:19:30 -0500 Subject: [PATCH 291/326] drm/amdgpu: Fix a copy-pasta comment This is not a scsi driver. Reviewed-by: Nirmoy Das Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7d2f7a2240b8..1cb7d73f7317 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5069,8 +5069,7 @@ out: * @pdev: pointer to PCI device * * Called when the error recovery driver tells us that its - * OK to resume normal operation. Use completion to allow - * halted scsi ops to resume. + * OK to resume normal operation. */ void amdgpu_pci_resume(struct pci_dev *pdev) { From a135a1b4c4db1f3b8cbed9676a40ede39feb3362 Mon Sep 17 00:00:00 2001 From: Stylon Wang Date: Tue, 10 Nov 2020 15:40:06 +0800 Subject: [PATCH 292/326] drm/amd/display: Fix memory leaks in S3 resume EDID parsing in S3 resume pushes new display modes to probed_modes list but doesn't consolidate to actual mode list. This creates a race condition when amdgpu_dm_connector_ddc_get_modes() re-initializes the list head without walking the list and results in memory leak. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=209987 Acked-by: Harry Wentland Acked-by: Alex Deucher Reviewed-by: Nicholas Kazlauskas Signed-off-by: Stylon Wang Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8fac6116591b..519080e9a233 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2386,7 +2386,8 @@ void amdgpu_dm_update_connector_after_detect( drm_connector_update_edid_property(connector, aconnector->edid); - drm_add_edid_modes(connector, aconnector->edid); + aconnector->num_modes = drm_add_edid_modes(connector, aconnector->edid); + drm_connector_list_update(connector); if (aconnector->dc_link->aux_mode) drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux, From d2ee8447e1bed7def30bab1748c876b8bd4e0876 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 24 Dec 2020 12:45:18 +0100 Subject: [PATCH 293/326] coccinelle: update expiring email addresses Signed-off-by: Julia Lawall --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b516bb34a8d5..48cd0f73484a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4324,8 +4324,8 @@ T: git git://linuxtv.org/media_tree.git F: drivers/media/pci/cobalt/ COCCINELLE/Semantic Patches (SmPL) -M: Julia Lawall -M: Gilles Muller +M: Julia Lawall +M: Gilles Muller M: Nicolas Palix M: Michal Marek L: cocci@systeme.lip6.fr (moderated for non-subscribers) From d8f6e5c6c83737cfdad46077e614885a3db9e809 Mon Sep 17 00:00:00 2001 From: Sumera Priyadarsini Date: Wed, 25 Nov 2020 02:02:12 +0530 Subject: [PATCH 294/326] scripts: coccicheck: Correct usage of make coccicheck The command "make coccicheck C=1 CHECK=scripts/coccicheck" results in the error: ./scripts/coccicheck: line 65: -1: shift count out of range This happens because every time the C variable is specified, the shell arguments need to be "shifted" in order to take only the last argument, which is the C file to test. These shell arguments mostly comprise flags that have been set in the Makefile. However, when coccicheck is specified in the make command as a rule, the number of shell arguments is zero, thus passing the invalid value -1 to the shift command, resulting in an error. Modify coccicheck to print correct usage of make coccicheck so as to avoid the error. Signed-off-by: Sumera Priyadarsini Signed-off-by: Julia Lawall --- scripts/coccicheck | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/coccicheck b/scripts/coccicheck index d7f6b7ff130a..65fee63aeadb 100755 --- a/scripts/coccicheck +++ b/scripts/coccicheck @@ -60,6 +60,18 @@ COCCIINCLUDE=${COCCIINCLUDE// -include/ --include} if [ "$C" = "1" -o "$C" = "2" ]; then ONLINE=1 + if [[ $# -le 0 ]]; then + echo '' + echo 'Specifying both the variable "C" and rule "coccicheck" in the make +command results in a shift count error.' + echo '' + echo 'Try specifying "scripts/coccicheck" as a value for the CHECK variable instead.' + echo '' + echo 'Example: make C=2 CHECK=scripts/coccicheck drivers/net/ethernet/ethoc.o' + echo '' + exit 1 + fi + # Take only the last argument, which is the C file to test shift $(( $# - 1 )) OPTIONS="$COCCIINCLUDE $1" From 6e5192143ab571dbefb584edf900565098bdfd23 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 09:03:04 -0300 Subject: [PATCH 295/326] tools headers UAPI: Update epoll_pwait2 affected files To pick the changes from: b0a0c2615f6f199a ("epoll: wire up syscall epoll_pwait2") That addresses these perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Willem de Bruijn Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 4 +++- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index fc48c64700eb..728752917785 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 +__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..78672124d28b 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 # # Due to a historical design error, certain syscalls are numbered differently From 7f3905f00a2025591a6883ee6880f928029b4d96 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 09:04:54 -0300 Subject: [PATCH 296/326] tools headers cpufeatures: Sync with the kernel sources To pick the changes in: 69372cf01290b958 ("x86/cpu: Add VM page flush MSR availablility as a CPUID feature") e1b35da5e624f8b0 ("x86: Enumerate AVX512 FP16 CPUID feature flag") That causes only these 'perf bench' objects to rebuild: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses these perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kyung Min Park Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Tom Lendacky Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index f5ef2d5b9231..84b887825f12 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -237,6 +237,7 @@ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -376,6 +377,7 @@ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ From fde668244d1d8d490b5b9daf53fe4f92a6751773 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 09:07:36 -0300 Subject: [PATCH 297/326] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes in: Fixes: 69372cf01290b958 ("x86/cpu: Add VM page flush MSR availablility as a CPUID feature") That cause these changes in tooling: $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > before $ cp arch/x86/include/asm/msr-index.h tools/arch/x86/include/asm/msr-index.h $ tools/perf/trace/beauty/tracepoints/x86_msr.sh > after $ diff -u before after --- before 2020-12-21 09:09:05.593005003 -0300 +++ after 2020-12-21 09:12:48.436994802 -0300 @@ -21,7 +21,7 @@ [0x0000004f] = "PPIN", [0x00000060] = "LBR_CORE_TO", [0x00000079] = "IA32_UCODE_WRITE", - [0x0000008b] = "IA32_UCODE_REV", + [0x0000008b] = "AMD64_PATCH_LEVEL", [0x0000008C] = "IA32_SGXLEPUBKEYHASH0", [0x0000008D] = "IA32_SGXLEPUBKEYHASH1", [0x0000008E] = "IA32_SGXLEPUBKEYHASH2", @@ -286,6 +286,7 @@ [0xc0010114 - x86_AMD_V_KVM_MSRs_offset] = "VM_CR", [0xc0010115 - x86_AMD_V_KVM_MSRs_offset] = "VM_IGNNE", [0xc0010117 - x86_AMD_V_KVM_MSRs_offset] = "VM_HSAVE_PA", + [0xc001011e - x86_AMD_V_KVM_MSRs_offset] = "AMD64_VM_PAGE_FLUSH", [0xc001011f - x86_AMD_V_KVM_MSRs_offset] = "AMD64_VIRT_SPEC_CTRL", [0xc0010130 - x86_AMD_V_KVM_MSRs_offset] = "AMD64_SEV_ES_GHCB", [0xc0010131 - x86_AMD_V_KVM_MSRs_offset] = "AMD64_SEV", $ The new MSR has a pattern that wasn't matched to avoid a clash with IA32_UCODE_REV, change the regex to prefer the more relevant AMD_ prefixed ones to catch this new AMD64_VM_PAGE_FLUSH MSR. Which causes these parts of tools/perf/ to be rebuilt: CC /tmp/build/perf/trace/beauty/tracepoints/x86_msr.o LD /tmp/build/perf/trace/beauty/tracepoints/perf-in.o LD /tmp/build/perf/trace/beauty/perf-in.o LD /tmp/build/perf/perf-in.o LINK /tmp/build/perf/perf This addresses this perf tools build warning: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Warning: Kernel ABI header at 'tools/arch/x86/include/asm/msr-index.h' differs from latest version at 'arch/x86/include/asm/msr-index.h' Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Tom Lendacky Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 1 + tools/perf/trace/beauty/tracepoints/x86_msr.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 2b5fc9accec4..546d6ecf0a35 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -472,6 +472,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 diff --git a/tools/perf/trace/beauty/tracepoints/x86_msr.sh b/tools/perf/trace/beauty/tracepoints/x86_msr.sh index 831c02cf0586..27ee1ea1fe94 100755 --- a/tools/perf/trace/beauty/tracepoints/x86_msr.sh +++ b/tools/perf/trace/beauty/tracepoints/x86_msr.sh @@ -15,7 +15,7 @@ x86_msr_index=${arch_x86_header_dir}/msr-index.h printf "static const char *x86_MSRs[] = {\n" regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MSR_([[:alnum:]][[:alnum:]_]+)[[:space:]]+(0x00000[[:xdigit:]]+)[[:space:]]*.*' -egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|AMD64|IA32_TSCDEADLINE|IDT_FCR4)' | \ +egrep $regex ${x86_msr_index} | egrep -v 'MSR_(ATOM|P[46]|IA32_(TSCDEADLINE|UCODE_REV)|IDT_FCR4)' | \ sed -r "s/$regex/\2 \1/g" | sort -n | \ xargs printf "\t[%s] = \"%s\",\n" printf "};\n\n" From 288807fc3a5f19ed77cb8c25342323bbe58a75a1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 09:20:52 -0300 Subject: [PATCH 298/326] tools headers UAPI: Sync kvm.h headers with the kernel sources To pick the changes in: fb04a1eddb1a65b6 ("KVM: X86: Implement ring-based dirty memory tracking") That result in these change in tooling: $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ cp arch/x86/include/uapi/asm/kvm.h tools/arch/x86/include/uapi/asm/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after --- before 2020-12-21 11:55:45.229737066 -0300 +++ after 2020-12-21 11:55:56.379983393 -0300 @@ -90,6 +90,7 @@ [0xc0] = "CLEAR_DIRTY_LOG", [0xc1] = "GET_SUPPORTED_HV_CPUID", [0xc6] = "X86_SET_MSR_FILTER", + [0xc7] = "RESET_DIRTY_RINGS", [0xe0] = "CREATE_DEVICE", [0xe1] = "SET_DEVICE_ATTR", [0xe2] = "GET_DEVICE_ATTR", $ Now one can use that string in filters when tracing ioctls, etc. And silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/kvm.h' differs from latest version at 'arch/x86/include/uapi/asm/kvm.h' diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Peter Xu Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 56 ++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 89e5f3d1bba8..8e76d3701db3 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -12,6 +12,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index ca41220b40b8..886802b8ffba 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -250,6 +250,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_ARM_NISV 28 #define KVM_EXIT_X86_RDMSR 29 #define KVM_EXIT_X86_WRMSR 30 +#define KVM_EXIT_DIRTY_RING_FULL 31 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -1053,6 +1054,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 +#define KVM_CAP_SYS_HYPERV_CPUID 191 +#define KVM_CAP_DIRTY_LOG_RING 192 #ifdef KVM_CAP_IRQ_ROUTING @@ -1511,7 +1514,7 @@ struct kvm_enc_region { /* Available with KVM_CAP_MANUAL_DIRTY_LOG_PROTECT_2 */ #define KVM_CLEAR_DIRTY_LOG _IOWR(KVMIO, 0xc0, struct kvm_clear_dirty_log) -/* Available with KVM_CAP_HYPERV_CPUID */ +/* Available with KVM_CAP_HYPERV_CPUID (vcpu) / KVM_CAP_SYS_HYPERV_CPUID (system) */ #define KVM_GET_SUPPORTED_HV_CPUID _IOWR(KVMIO, 0xc1, struct kvm_cpuid2) /* Available with KVM_CAP_ARM_SVE */ @@ -1557,6 +1560,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_X86_MSR_FILTER */ #define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) +/* Available with KVM_CAP_DIRTY_LOG_RING */ +#define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1710,4 +1716,52 @@ struct kvm_hyperv_eventfd { #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) +/* + * Arch needs to define the macro after implementing the dirty ring + * feature. KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the + * starting page offset of the dirty ring structures. + */ +#ifndef KVM_DIRTY_LOG_PAGE_OFFSET +#define KVM_DIRTY_LOG_PAGE_OFFSET 0 +#endif + +/* + * KVM dirty GFN flags, defined as: + * + * |---------------+---------------+--------------| + * | bit 1 (reset) | bit 0 (dirty) | Status | + * |---------------+---------------+--------------| + * | 0 | 0 | Invalid GFN | + * | 0 | 1 | Dirty GFN | + * | 1 | X | GFN to reset | + * |---------------+---------------+--------------| + * + * Lifecycle of a dirty GFN goes like: + * + * dirtied harvested reset + * 00 -----------> 01 -------------> 1X -------+ + * ^ | + * | | + * +------------------------------------------+ + * + * The userspace program is only responsible for the 01->1X state + * conversion after harvesting an entry. Also, it must not skip any + * dirty bits, so that dirty bits are always harvested in sequence. + */ +#define KVM_DIRTY_GFN_F_DIRTY BIT(0) +#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_MASK 0x3 + +/* + * KVM dirty rings should be mapped at KVM_DIRTY_LOG_PAGE_OFFSET of + * per-vcpu mmaped regions as an array of struct kvm_dirty_gfn. The + * size of the gfn buffer is decided by the first argument when + * enabling KVM_CAP_DIRTY_LOG_RING. + */ +struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; + __u64 offset; +}; + #endif /* __LINUX_KVM_H */ From cd97448db80e0238a819dc6b733da6ec0173cadd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 12:51:03 -0300 Subject: [PATCH 299/326] tools headers UAPI: Sync KVM's vmx.h header with the kernel sources To pick the changes in: bf0cd88ce363a2de ("KVM: x86: emulate wait-for-SIPI and SIPI-VMExit") That makes 'perf kvm-stat' aware of this new SIPI_SIGNAL exit reason, thus addressing the following perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/vmx.h' differs from latest version at 'arch/x86/include/uapi/asm/vmx.h' diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Yadong Qi Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index b8ff9e8ac0d5..ada955c5ebb6 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -32,6 +32,7 @@ #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 +#define EXIT_REASON_SIPI_SIGNAL 4 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -94,6 +95,7 @@ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ + { EXIT_REASON_SIPI_SIGNAL, "SIPI_SIGNAL" }, \ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ From 9880e71cbaa8a0e826d8f144704301476b2d6cf9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 12:53:44 -0300 Subject: [PATCH 300/326] tools kvm headers: Update KVM headers from the kernel sources To pick the changes from: 8d14797b53f044fd ("KVM: arm64: Move 'struct kvm_arch_memory_slot' out of uapi/") That don't causes any changes in tooling, only addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm64/include/uapi/asm/kvm.h' diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Marc Zyngier Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 1c17c3a24411..24223adae150 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -156,9 +156,6 @@ struct kvm_sync_regs { __u64 device_irq_level; }; -struct kvm_arch_memory_slot { -}; - /* * PMU filter structure. Describe a range of events with a particular * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. From b71df82d05b7a38f38c4b1109c57b209b8ed43ff Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 21 Dec 2020 20:04:45 -0300 Subject: [PATCH 301/326] tools headers UAPI: Synch KVM's svm.h header with the kernel To pick up the changes from: d1949b93c60504b3 ("KVM: SVM: Add support for CR8 write traps for an SEV-ES guest") 5b51cb13160ae0ba ("KVM: SVM: Add support for CR4 write traps for an SEV-ES guest") f27ad38aac23263c ("KVM: SVM: Add support for CR0 write traps for an SEV-ES guest") 2985afbcdbb1957a ("KVM: SVM: Add support for EFER write traps for an SEV-ES guest") 291bd20d5d88814a ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Picking these new SVM exit reasons: + { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \ + { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \ + { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ + { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ + { SVM_EXIT_VMGEXIT, "vmgexit" }, \ + { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \ + { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \ + { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ + { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ + { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ And address this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/svm.h' differs from latest version at 'arch/x86/include/uapi/asm/svm.h' diff -u tools/arch/x86/include/uapi/asm/svm.h arch/x86/include/uapi/asm/svm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Tom Lendacky Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/svm.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index f1d8307454e0..554f75fe013c 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -77,10 +77,28 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_RDPRU 0x08e +#define SVM_EXIT_EFER_WRITE_TRAP 0x08f +#define SVM_EXIT_CR0_WRITE_TRAP 0x090 +#define SVM_EXIT_CR1_WRITE_TRAP 0x091 +#define SVM_EXIT_CR2_WRITE_TRAP 0x092 +#define SVM_EXIT_CR3_WRITE_TRAP 0x093 +#define SVM_EXIT_CR4_WRITE_TRAP 0x094 +#define SVM_EXIT_CR5_WRITE_TRAP 0x095 +#define SVM_EXIT_CR6_WRITE_TRAP 0x096 +#define SVM_EXIT_CR7_WRITE_TRAP 0x097 +#define SVM_EXIT_CR8_WRITE_TRAP 0x098 +#define SVM_EXIT_CR9_WRITE_TRAP 0x099 +#define SVM_EXIT_CR10_WRITE_TRAP 0x09a +#define SVM_EXIT_CR11_WRITE_TRAP 0x09b +#define SVM_EXIT_CR12_WRITE_TRAP 0x09c +#define SVM_EXIT_CR13_WRITE_TRAP 0x09d +#define SVM_EXIT_CR14_WRITE_TRAP 0x09e +#define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 +#define SVM_EXIT_VMGEXIT 0x403 /* SEV-ES software-defined VMGEXIT events */ #define SVM_VMGEXIT_MMIO_READ 0x80000001 @@ -183,10 +201,20 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_EFER_WRITE_TRAP, "write_efer_trap" }, \ + { SVM_EXIT_CR0_WRITE_TRAP, "write_cr0_trap" }, \ + { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ + { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ + { SVM_EXIT_VMGEXIT, "vmgexit" }, \ + { SVM_VMGEXIT_MMIO_READ, "vmgexit_mmio_read" }, \ + { SVM_VMGEXIT_MMIO_WRITE, "vmgexit_mmio_write" }, \ + { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ + { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ + { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } From 9bad32b2c63c985fc9f04b29186974ad5bb0b74c Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 18 Dec 2020 15:59:21 +0800 Subject: [PATCH 302/326] perf powerpc: Move syscall.tbl check to check-headers.sh It is better to check syscall.tbl for powerpc in check-headers.sh, it is similar with commit c9b51a017065 ("perf tools: Move syscall_64.tbl check into check-headers.sh"). Signed-off-by: Tiezhu Yang Reviewed-by: Naveen N. Rao Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xuefeng Li Link: http://lore.kernel.org/lkml/1608278364-6733-2-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/powerpc/Makefile | 7 ------- tools/perf/check-headers.sh | 1 + 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index e58d00d62f02..840ea0e59287 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -14,7 +14,6 @@ PERF_HAVE_JITDUMP := 1 out := $(OUTPUT)arch/powerpc/include/generated/asm header32 := $(out)/syscalls_32.c header64 := $(out)/syscalls_64.c -syskrn := $(srctree)/arch/powerpc/kernel/syscalls/syscall.tbl sysprf := $(srctree)/tools/perf/arch/powerpc/entry/syscalls sysdef := $(sysprf)/syscall.tbl systbl := $(sysprf)/mksyscalltbl @@ -23,15 +22,9 @@ systbl := $(sysprf)/mksyscalltbl _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') $(header64): $(sysdef) $(systbl) - @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ - (diff -B $(sysdef) $(syskrn) >/dev/null) \ - || echo "Warning: Kernel ABI header at '$(sysdef)' differs from latest version at '$(syskrn)'" >&2 )) || true $(Q)$(SHELL) '$(systbl)' '64' $(sysdef) > $@ $(header32): $(sysdef) $(systbl) - @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ - (diff -B $(sysdef) $(syskrn) >/dev/null) \ - || echo "Warning: Kernel ABI header at '$(sysdef)' differs from latest version at '$(syskrn)'" >&2 )) || true $(Q)$(SHELL) '$(systbl)' '32' $(sysdef) > $@ clean:: diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 15ecb1803fb9..3c96fd3a7370 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -144,6 +144,7 @@ check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in # diff non-symmetric files check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl +check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl for i in $BEAUTY_FILES; do beauty_check $i -B From 22ffc3f5598d2a51e2da4ea5e07e734715bde782 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 18 Dec 2020 15:59:22 +0800 Subject: [PATCH 303/326] perf s390: Move syscall.tbl check into check-headers.sh It is better to check syscall.tbl for s390 in check-headers.sh, it is similar with commit c9b51a017065 ("perf tools: Move syscall_64.tbl check into check-headers.sh"). Signed-off-by: Tiezhu Yang Reviewed-by: Naveen N. Rao Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xuefeng Li Link: http://lore.kernel.org/lkml/1608278364-6733-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/Makefile | 4 ---- tools/perf/check-headers.sh | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile index 6ac8887be7c9..74bffbea03e2 100644 --- a/tools/perf/arch/s390/Makefile +++ b/tools/perf/arch/s390/Makefile @@ -12,7 +12,6 @@ PERF_HAVE_JITDUMP := 1 out := $(OUTPUT)arch/s390/include/generated/asm header := $(out)/syscalls_64.c -syskrn := $(srctree)/arch/s390/kernel/syscalls/syscall.tbl sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls sysdef := $(sysprf)/syscall.tbl systbl := $(sysprf)/mksyscalltbl @@ -21,9 +20,6 @@ systbl := $(sysprf)/mksyscalltbl _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') $(header): $(sysdef) $(systbl) - @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ - (diff -B $(sysdef) $(syskrn) >/dev/null) \ - || echo "Warning: Kernel ABI header at '$(sysdef)' differs from latest version at '$(syskrn)'" >&2 )) || true $(Q)$(SHELL) '$(systbl)' $(sysdef) > $@ clean:: diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 3c96fd3a7370..dded93a2bc89 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -145,6 +145,7 @@ check arch/x86/lib/insn.c '-I "^#include [\"<]\(../include/\)*asm/in # diff non-symmetric files check_2 tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl check_2 tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl +check_2 tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl for i in $BEAUTY_FILES; do beauty_check $i -B From c5ef52944a2d80017092cdf6aa474b2f4d596072 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 23 Dec 2020 13:13:16 -0300 Subject: [PATCH 304/326] perf tools: Update powerpc's syscall.tbl copy from the kernel sources This silences the following tools/perf/ build warning: Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' Just make them same: cp arch/powerpc/kernel/syscalls/syscall.tbl tools/perf/arch/powerpc/entry/syscalls/syscall.tbl Signed-off-by: Tiezhu Yang Reviewed-by: Naveen N. Rao Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xuefeng Li Link: http://lore.kernel.org/lkml/1608278364-6733-4-git-send-email-yangtiezhu@loongson.cn [ There were updates after Tiezhu's post, so I just updated the copy ] Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/powerpc/entry/syscalls/syscall.tbl | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index b168364ac050..f744eb5cba88 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -9,7 +9,9 @@ # 0 nospu restart_syscall sys_restart_syscall 1 nospu exit sys_exit -2 nospu fork ppc_fork +2 32 fork ppc_fork sys_fork +2 64 fork sys_fork +2 spu fork sys_ni_syscall 3 common read sys_read 4 common write sys_write 5 common open sys_open compat_sys_open @@ -158,7 +160,9 @@ 119 32 sigreturn sys_sigreturn compat_sys_sigreturn 119 64 sigreturn sys_ni_syscall 119 spu sigreturn sys_ni_syscall -120 nospu clone ppc_clone +120 32 clone ppc_clone sys_clone +120 64 clone sys_clone +120 spu clone sys_ni_syscall 121 common setdomainname sys_setdomainname 122 common uname sys_newuname 123 common modify_ldt sys_ni_syscall @@ -240,7 +244,9 @@ 186 spu sendfile sys_sendfile64 187 common getpmsg sys_ni_syscall 188 common putpmsg sys_ni_syscall -189 nospu vfork ppc_vfork +189 32 vfork ppc_vfork sys_vfork +189 64 vfork sys_vfork +189 spu vfork sys_ni_syscall 190 common ugetrlimit sys_getrlimit compat_sys_getrlimit 191 common readahead sys_readahead compat_sys_readahead 192 32 mmap2 sys_mmap2 compat_sys_mmap2 @@ -316,8 +322,8 @@ 248 32 clock_nanosleep sys_clock_nanosleep_time32 248 64 clock_nanosleep sys_clock_nanosleep 248 spu clock_nanosleep sys_clock_nanosleep -249 32 swapcontext ppc_swapcontext ppc32_swapcontext -249 64 swapcontext ppc64_swapcontext +249 32 swapcontext ppc_swapcontext compat_sys_swapcontext +249 64 swapcontext sys_swapcontext 249 spu swapcontext sys_ni_syscall 250 common tgkill sys_tgkill 251 32 utimes sys_utimes_time32 @@ -456,7 +462,7 @@ 361 common bpf sys_bpf 362 nospu execveat sys_execveat compat_sys_execveat 363 32 switch_endian sys_ni_syscall -363 64 switch_endian ppc_switch_endian +363 64 switch_endian sys_switch_endian 363 spu switch_endian sys_ni_syscall 364 common userfaultfd sys_userfaultfd 365 common membarrier sys_membarrier @@ -516,6 +522,12 @@ 432 common fsmount sys_fsmount 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open -435 nospu clone3 ppc_clone3 +435 32 clone3 ppc_clone3 sys_clone3 +435 64 clone3 sys_clone3 +435 spu clone3 sys_ni_syscall +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 From b27d20ab1c6a1a7738c02419c28287d260ca8036 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 23 Dec 2020 13:24:35 -0300 Subject: [PATCH 305/326] perf tools: Update s390's syscall.tbl copy from the kernel sources This silences the following tools/perf/ build warning: Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' Just make them same: cp arch/s390/kernel/syscalls/syscall.tbl tools/perf/arch/s390/entry/syscalls/syscall.tbl Signed-off-by: Tiezhu Yang Reviewed-by: Naveen N. Rao Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xuefeng Li Link: http://lore.kernel.org/lkml/1608278364-6733-5-git-send-email-yangtiezhu@loongson.cn [ There were updates after Tiezhu's post, so I just updated the copy ] Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/arch/s390/entry/syscalls/syscall.tbl | 396 ++++++++++-------- 1 file changed, 226 insertions(+), 170 deletions(-) diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index d2fa9647ce25..d443423495e5 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -15,86 +15,86 @@ 5 common open sys_open compat_sys_open 6 common close sys_close sys_close 7 common restart_syscall sys_restart_syscall sys_restart_syscall -8 common creat sys_creat compat_sys_creat -9 common link sys_link compat_sys_link -10 common unlink sys_unlink compat_sys_unlink +8 common creat sys_creat sys_creat +9 common link sys_link sys_link +10 common unlink sys_unlink sys_unlink 11 common execve sys_execve compat_sys_execve -12 common chdir sys_chdir compat_sys_chdir -13 32 time - compat_sys_time -14 common mknod sys_mknod compat_sys_mknod -15 common chmod sys_chmod compat_sys_chmod -16 32 lchown - compat_sys_s390_lchown16 +12 common chdir sys_chdir sys_chdir +13 32 time - sys_time32 +14 common mknod sys_mknod sys_mknod +15 common chmod sys_chmod sys_chmod +16 32 lchown - sys_lchown16 19 common lseek sys_lseek compat_sys_lseek 20 common getpid sys_getpid sys_getpid -21 common mount sys_mount -22 common umount sys_oldumount compat_sys_oldumount -23 32 setuid - compat_sys_s390_setuid16 -24 32 getuid - compat_sys_s390_getuid16 -25 32 stime - compat_sys_stime +21 common mount sys_mount sys_mount +22 common umount sys_oldumount sys_oldumount +23 32 setuid - sys_setuid16 +24 32 getuid - sys_getuid16 +25 32 stime - sys_stime32 26 common ptrace sys_ptrace compat_sys_ptrace 27 common alarm sys_alarm sys_alarm 29 common pause sys_pause sys_pause -30 common utime sys_utime compat_sys_utime -33 common access sys_access compat_sys_access +30 common utime sys_utime sys_utime32 +33 common access sys_access sys_access 34 common nice sys_nice sys_nice 36 common sync sys_sync sys_sync 37 common kill sys_kill sys_kill -38 common rename sys_rename compat_sys_rename -39 common mkdir sys_mkdir compat_sys_mkdir -40 common rmdir sys_rmdir compat_sys_rmdir +38 common rename sys_rename sys_rename +39 common mkdir sys_mkdir sys_mkdir +40 common rmdir sys_rmdir sys_rmdir 41 common dup sys_dup sys_dup -42 common pipe sys_pipe compat_sys_pipe +42 common pipe sys_pipe sys_pipe 43 common times sys_times compat_sys_times -45 common brk sys_brk compat_sys_brk -46 32 setgid - compat_sys_s390_setgid16 -47 32 getgid - compat_sys_s390_getgid16 -48 common signal sys_signal compat_sys_signal -49 32 geteuid - compat_sys_s390_geteuid16 -50 32 getegid - compat_sys_s390_getegid16 -51 common acct sys_acct compat_sys_acct -52 common umount2 sys_umount compat_sys_umount +45 common brk sys_brk sys_brk +46 32 setgid - sys_setgid16 +47 32 getgid - sys_getgid16 +48 common signal sys_signal sys_signal +49 32 geteuid - sys_geteuid16 +50 32 getegid - sys_getegid16 +51 common acct sys_acct sys_acct +52 common umount2 sys_umount sys_umount 54 common ioctl sys_ioctl compat_sys_ioctl 55 common fcntl sys_fcntl compat_sys_fcntl 57 common setpgid sys_setpgid sys_setpgid 60 common umask sys_umask sys_umask -61 common chroot sys_chroot compat_sys_chroot +61 common chroot sys_chroot sys_chroot 62 common ustat sys_ustat compat_sys_ustat 63 common dup2 sys_dup2 sys_dup2 64 common getppid sys_getppid sys_getppid 65 common getpgrp sys_getpgrp sys_getpgrp 66 common setsid sys_setsid sys_setsid 67 common sigaction sys_sigaction compat_sys_sigaction -70 32 setreuid - compat_sys_s390_setreuid16 -71 32 setregid - compat_sys_s390_setregid16 -72 common sigsuspend sys_sigsuspend compat_sys_sigsuspend +70 32 setreuid - sys_setreuid16 +71 32 setregid - sys_setregid16 +72 common sigsuspend sys_sigsuspend sys_sigsuspend 73 common sigpending sys_sigpending compat_sys_sigpending -74 common sethostname sys_sethostname compat_sys_sethostname +74 common sethostname sys_sethostname sys_sethostname 75 common setrlimit sys_setrlimit compat_sys_setrlimit 76 32 getrlimit - compat_sys_old_getrlimit 77 common getrusage sys_getrusage compat_sys_getrusage 78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday 79 common settimeofday sys_settimeofday compat_sys_settimeofday -80 32 getgroups - compat_sys_s390_getgroups16 -81 32 setgroups - compat_sys_s390_setgroups16 -83 common symlink sys_symlink compat_sys_symlink -85 common readlink sys_readlink compat_sys_readlink -86 common uselib sys_uselib compat_sys_uselib -87 common swapon sys_swapon compat_sys_swapon -88 common reboot sys_reboot compat_sys_reboot +80 32 getgroups - sys_getgroups16 +81 32 setgroups - sys_setgroups16 +83 common symlink sys_symlink sys_symlink +85 common readlink sys_readlink sys_readlink +86 common uselib sys_uselib sys_uselib +87 common swapon sys_swapon sys_swapon +88 common reboot sys_reboot sys_reboot 89 common readdir - compat_sys_old_readdir 90 common mmap sys_old_mmap compat_sys_s390_old_mmap -91 common munmap sys_munmap compat_sys_munmap +91 common munmap sys_munmap sys_munmap 92 common truncate sys_truncate compat_sys_truncate 93 common ftruncate sys_ftruncate compat_sys_ftruncate 94 common fchmod sys_fchmod sys_fchmod -95 32 fchown - compat_sys_s390_fchown16 +95 32 fchown - sys_fchown16 96 common getpriority sys_getpriority sys_getpriority 97 common setpriority sys_setpriority sys_setpriority 99 common statfs sys_statfs compat_sys_statfs 100 common fstatfs sys_fstatfs compat_sys_fstatfs 101 32 ioperm - - 102 common socketcall sys_socketcall compat_sys_socketcall -103 common syslog sys_syslog compat_sys_syslog +103 common syslog sys_syslog sys_syslog 104 common setitimer sys_setitimer compat_sys_setitimer 105 common getitimer sys_getitimer compat_sys_getitimer 106 common stat sys_newstat compat_sys_newstat @@ -104,76 +104,76 @@ 111 common vhangup sys_vhangup sys_vhangup 112 common idle - - 114 common wait4 sys_wait4 compat_sys_wait4 -115 common swapoff sys_swapoff compat_sys_swapoff +115 common swapoff sys_swapoff sys_swapoff 116 common sysinfo sys_sysinfo compat_sys_sysinfo 117 common ipc sys_s390_ipc compat_sys_s390_ipc 118 common fsync sys_fsync sys_fsync 119 common sigreturn sys_sigreturn compat_sys_sigreturn -120 common clone sys_clone compat_sys_clone -121 common setdomainname sys_setdomainname compat_sys_setdomainname -122 common uname sys_newuname compat_sys_newuname -124 common adjtimex sys_adjtimex compat_sys_adjtimex -125 common mprotect sys_mprotect compat_sys_mprotect +120 common clone sys_clone sys_clone +121 common setdomainname sys_setdomainname sys_setdomainname +122 common uname sys_newuname sys_newuname +124 common adjtimex sys_adjtimex sys_adjtimex_time32 +125 common mprotect sys_mprotect sys_mprotect 126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask 127 common create_module - - -128 common init_module sys_init_module compat_sys_init_module -129 common delete_module sys_delete_module compat_sys_delete_module +128 common init_module sys_init_module sys_init_module +129 common delete_module sys_delete_module sys_delete_module 130 common get_kernel_syms - - -131 common quotactl sys_quotactl compat_sys_quotactl +131 common quotactl sys_quotactl sys_quotactl 132 common getpgid sys_getpgid sys_getpgid 133 common fchdir sys_fchdir sys_fchdir -134 common bdflush sys_bdflush compat_sys_bdflush -135 common sysfs sys_sysfs compat_sys_sysfs +134 common bdflush sys_bdflush sys_bdflush +135 common sysfs sys_sysfs sys_sysfs 136 common personality sys_s390_personality sys_s390_personality 137 common afs_syscall - - -138 32 setfsuid - compat_sys_s390_setfsuid16 -139 32 setfsgid - compat_sys_s390_setfsgid16 -140 32 _llseek - compat_sys_llseek +138 32 setfsuid - sys_setfsuid16 +139 32 setfsgid - sys_setfsgid16 +140 32 _llseek - sys_llseek 141 common getdents sys_getdents compat_sys_getdents 142 32 _newselect - compat_sys_select 142 64 select sys_select - 143 common flock sys_flock sys_flock -144 common msync sys_msync compat_sys_msync -145 common readv sys_readv -146 common writev sys_writev +144 common msync sys_msync sys_msync +145 common readv sys_readv sys_readv +146 common writev sys_writev sys_writev 147 common getsid sys_getsid sys_getsid 148 common fdatasync sys_fdatasync sys_fdatasync 149 common _sysctl - - -150 common mlock sys_mlock compat_sys_mlock -151 common munlock sys_munlock compat_sys_munlock +150 common mlock sys_mlock sys_mlock +151 common munlock sys_munlock sys_munlock 152 common mlockall sys_mlockall sys_mlockall 153 common munlockall sys_munlockall sys_munlockall -154 common sched_setparam sys_sched_setparam compat_sys_sched_setparam -155 common sched_getparam sys_sched_getparam compat_sys_sched_getparam -156 common sched_setscheduler sys_sched_setscheduler compat_sys_sched_setscheduler +154 common sched_setparam sys_sched_setparam sys_sched_setparam +155 common sched_getparam sys_sched_getparam sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler sys_sched_setscheduler 157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler 158 common sched_yield sys_sched_yield sys_sched_yield 159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max 160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min -161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval -162 common nanosleep sys_nanosleep compat_sys_nanosleep -163 common mremap sys_mremap compat_sys_mremap -164 32 setresuid - compat_sys_s390_setresuid16 -165 32 getresuid - compat_sys_s390_getresuid16 +161 common sched_rr_get_interval sys_sched_rr_get_interval sys_sched_rr_get_interval_time32 +162 common nanosleep sys_nanosleep sys_nanosleep_time32 +163 common mremap sys_mremap sys_mremap +164 32 setresuid - sys_setresuid16 +165 32 getresuid - sys_getresuid16 167 common query_module - - -168 common poll sys_poll compat_sys_poll +168 common poll sys_poll sys_poll 169 common nfsservctl - - -170 32 setresgid - compat_sys_s390_setresgid16 -171 32 getresgid - compat_sys_s390_getresgid16 -172 common prctl sys_prctl compat_sys_prctl +170 32 setresgid - sys_setresgid16 +171 32 getresgid - sys_getresgid16 +172 common prctl sys_prctl sys_prctl 173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn 174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask 176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending -177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time32 178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend 180 common pread64 sys_pread64 compat_sys_s390_pread64 181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 -182 32 chown - compat_sys_s390_chown16 -183 common getcwd sys_getcwd compat_sys_getcwd -184 common capget sys_capget compat_sys_capget -185 common capset sys_capset compat_sys_capset +182 32 chown - sys_chown16 +183 common getcwd sys_getcwd sys_getcwd +184 common capget sys_capget sys_capget +185 common capset sys_capset sys_capset 186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack 187 common sendfile sys_sendfile64 compat_sys_sendfile 188 common getpmsg - - @@ -187,7 +187,7 @@ 195 32 stat64 - compat_sys_s390_stat64 196 32 lstat64 - compat_sys_s390_lstat64 197 32 fstat64 - compat_sys_s390_fstat64 -198 32 lchown32 - compat_sys_lchown +198 32 lchown32 - sys_lchown 198 64 lchown sys_lchown - 199 32 getuid32 - sys_getuid 199 64 getuid sys_getuid - @@ -201,21 +201,21 @@ 203 64 setreuid sys_setreuid - 204 32 setregid32 - sys_setregid 204 64 setregid sys_setregid - -205 32 getgroups32 - compat_sys_getgroups +205 32 getgroups32 - sys_getgroups 205 64 getgroups sys_getgroups - -206 32 setgroups32 - compat_sys_setgroups +206 32 setgroups32 - sys_setgroups 206 64 setgroups sys_setgroups - 207 32 fchown32 - sys_fchown 207 64 fchown sys_fchown - 208 32 setresuid32 - sys_setresuid 208 64 setresuid sys_setresuid - -209 32 getresuid32 - compat_sys_getresuid +209 32 getresuid32 - sys_getresuid 209 64 getresuid sys_getresuid - 210 32 setresgid32 - sys_setresgid 210 64 setresgid sys_setresgid - -211 32 getresgid32 - compat_sys_getresgid +211 32 getresgid32 - sys_getresgid 211 64 getresgid sys_getresgid - -212 32 chown32 - compat_sys_chown +212 32 chown32 - sys_chown 212 64 chown sys_chown - 213 32 setuid32 - sys_setuid 213 64 setuid sys_setuid - @@ -225,166 +225,222 @@ 215 64 setfsuid sys_setfsuid - 216 32 setfsgid32 - sys_setfsgid 216 64 setfsgid sys_setfsgid - -217 common pivot_root sys_pivot_root compat_sys_pivot_root -218 common mincore sys_mincore compat_sys_mincore -219 common madvise sys_madvise compat_sys_madvise -220 common getdents64 sys_getdents64 compat_sys_getdents64 +217 common pivot_root sys_pivot_root sys_pivot_root +218 common mincore sys_mincore sys_mincore +219 common madvise sys_madvise sys_madvise +220 common getdents64 sys_getdents64 sys_getdents64 221 32 fcntl64 - compat_sys_fcntl64 222 common readahead sys_readahead compat_sys_s390_readahead 223 32 sendfile64 - compat_sys_sendfile64 -224 common setxattr sys_setxattr compat_sys_setxattr -225 common lsetxattr sys_lsetxattr compat_sys_lsetxattr -226 common fsetxattr sys_fsetxattr compat_sys_fsetxattr -227 common getxattr sys_getxattr compat_sys_getxattr -228 common lgetxattr sys_lgetxattr compat_sys_lgetxattr -229 common fgetxattr sys_fgetxattr compat_sys_fgetxattr -230 common listxattr sys_listxattr compat_sys_listxattr -231 common llistxattr sys_llistxattr compat_sys_llistxattr -232 common flistxattr sys_flistxattr compat_sys_flistxattr -233 common removexattr sys_removexattr compat_sys_removexattr -234 common lremovexattr sys_lremovexattr compat_sys_lremovexattr -235 common fremovexattr sys_fremovexattr compat_sys_fremovexattr +224 common setxattr sys_setxattr sys_setxattr +225 common lsetxattr sys_lsetxattr sys_lsetxattr +226 common fsetxattr sys_fsetxattr sys_fsetxattr +227 common getxattr sys_getxattr sys_getxattr +228 common lgetxattr sys_lgetxattr sys_lgetxattr +229 common fgetxattr sys_fgetxattr sys_fgetxattr +230 common listxattr sys_listxattr sys_listxattr +231 common llistxattr sys_llistxattr sys_llistxattr +232 common flistxattr sys_flistxattr sys_flistxattr +233 common removexattr sys_removexattr sys_removexattr +234 common lremovexattr sys_lremovexattr sys_lremovexattr +235 common fremovexattr sys_fremovexattr sys_fremovexattr 236 common gettid sys_gettid sys_gettid 237 common tkill sys_tkill sys_tkill -238 common futex sys_futex compat_sys_futex +238 common futex sys_futex sys_futex_time32 239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity 240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity 241 common tgkill sys_tgkill sys_tgkill 243 common io_setup sys_io_setup compat_sys_io_setup -244 common io_destroy sys_io_destroy compat_sys_io_destroy -245 common io_getevents sys_io_getevents compat_sys_io_getevents +244 common io_destroy sys_io_destroy sys_io_destroy +245 common io_getevents sys_io_getevents sys_io_getevents_time32 246 common io_submit sys_io_submit compat_sys_io_submit -247 common io_cancel sys_io_cancel compat_sys_io_cancel +247 common io_cancel sys_io_cancel sys_io_cancel 248 common exit_group sys_exit_group sys_exit_group 249 common epoll_create sys_epoll_create sys_epoll_create -250 common epoll_ctl sys_epoll_ctl compat_sys_epoll_ctl -251 common epoll_wait sys_epoll_wait compat_sys_epoll_wait -252 common set_tid_address sys_set_tid_address compat_sys_set_tid_address +250 common epoll_ctl sys_epoll_ctl sys_epoll_ctl +251 common epoll_wait sys_epoll_wait sys_epoll_wait +252 common set_tid_address sys_set_tid_address sys_set_tid_address 253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 254 common timer_create sys_timer_create compat_sys_timer_create -255 common timer_settime sys_timer_settime compat_sys_timer_settime -256 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +255 common timer_settime sys_timer_settime sys_timer_settime32 +256 common timer_gettime sys_timer_gettime sys_timer_gettime32 257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun 258 common timer_delete sys_timer_delete sys_timer_delete -259 common clock_settime sys_clock_settime compat_sys_clock_settime -260 common clock_gettime sys_clock_gettime compat_sys_clock_gettime -261 common clock_getres sys_clock_getres compat_sys_clock_getres -262 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +259 common clock_settime sys_clock_settime sys_clock_settime32 +260 common clock_gettime sys_clock_gettime sys_clock_gettime32 +261 common clock_getres sys_clock_getres sys_clock_getres_time32 +262 common clock_nanosleep sys_clock_nanosleep sys_clock_nanosleep_time32 264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 265 common statfs64 sys_statfs64 compat_sys_statfs64 266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 -267 common remap_file_pages sys_remap_file_pages compat_sys_remap_file_pages +267 common remap_file_pages sys_remap_file_pages sys_remap_file_pages 268 common mbind sys_mbind compat_sys_mbind 269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy 270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy 271 common mq_open sys_mq_open compat_sys_mq_open -272 common mq_unlink sys_mq_unlink compat_sys_mq_unlink -273 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend -274 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +272 common mq_unlink sys_mq_unlink sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend sys_mq_timedsend_time32 +274 common mq_timedreceive sys_mq_timedreceive sys_mq_timedreceive_time32 275 common mq_notify sys_mq_notify compat_sys_mq_notify 276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr 277 common kexec_load sys_kexec_load compat_sys_kexec_load -278 common add_key sys_add_key compat_sys_add_key -279 common request_key sys_request_key compat_sys_request_key +278 common add_key sys_add_key sys_add_key +279 common request_key sys_request_key sys_request_key 280 common keyctl sys_keyctl compat_sys_keyctl 281 common waitid sys_waitid compat_sys_waitid 282 common ioprio_set sys_ioprio_set sys_ioprio_set 283 common ioprio_get sys_ioprio_get sys_ioprio_get 284 common inotify_init sys_inotify_init sys_inotify_init -285 common inotify_add_watch sys_inotify_add_watch compat_sys_inotify_add_watch +285 common inotify_add_watch sys_inotify_add_watch sys_inotify_add_watch 286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch 287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages 288 common openat sys_openat compat_sys_openat -289 common mkdirat sys_mkdirat compat_sys_mkdirat -290 common mknodat sys_mknodat compat_sys_mknodat -291 common fchownat sys_fchownat compat_sys_fchownat -292 common futimesat sys_futimesat compat_sys_futimesat +289 common mkdirat sys_mkdirat sys_mkdirat +290 common mknodat sys_mknodat sys_mknodat +291 common fchownat sys_fchownat sys_fchownat +292 common futimesat sys_futimesat sys_futimesat_time32 293 32 fstatat64 - compat_sys_s390_fstatat64 293 64 newfstatat sys_newfstatat - -294 common unlinkat sys_unlinkat compat_sys_unlinkat -295 common renameat sys_renameat compat_sys_renameat -296 common linkat sys_linkat compat_sys_linkat -297 common symlinkat sys_symlinkat compat_sys_symlinkat -298 common readlinkat sys_readlinkat compat_sys_readlinkat -299 common fchmodat sys_fchmodat compat_sys_fchmodat -300 common faccessat sys_faccessat compat_sys_faccessat -301 common pselect6 sys_pselect6 compat_sys_pselect6 -302 common ppoll sys_ppoll compat_sys_ppoll -303 common unshare sys_unshare compat_sys_unshare +294 common unlinkat sys_unlinkat sys_unlinkat +295 common renameat sys_renameat sys_renameat +296 common linkat sys_linkat sys_linkat +297 common symlinkat sys_symlinkat sys_symlinkat +298 common readlinkat sys_readlinkat sys_readlinkat +299 common fchmodat sys_fchmodat sys_fchmodat +300 common faccessat sys_faccessat sys_faccessat +301 common pselect6 sys_pselect6 compat_sys_pselect6_time32 +302 common ppoll sys_ppoll compat_sys_ppoll_time32 +303 common unshare sys_unshare sys_unshare 304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list 305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list -306 common splice sys_splice compat_sys_splice +306 common splice sys_splice sys_splice 307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range -308 common tee sys_tee compat_sys_tee +308 common tee sys_tee sys_tee 309 common vmsplice sys_vmsplice sys_vmsplice 310 common move_pages sys_move_pages compat_sys_move_pages -311 common getcpu sys_getcpu compat_sys_getcpu +311 common getcpu sys_getcpu sys_getcpu 312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait -313 common utimes sys_utimes compat_sys_utimes +313 common utimes sys_utimes sys_utimes_time32 314 common fallocate sys_fallocate compat_sys_s390_fallocate -315 common utimensat sys_utimensat compat_sys_utimensat +315 common utimensat sys_utimensat sys_utimensat_time32 316 common signalfd sys_signalfd compat_sys_signalfd 317 common timerfd - - 318 common eventfd sys_eventfd sys_eventfd 319 common timerfd_create sys_timerfd_create sys_timerfd_create -320 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime -321 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +320 common timerfd_settime sys_timerfd_settime sys_timerfd_settime32 +321 common timerfd_gettime sys_timerfd_gettime sys_timerfd_gettime32 322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 323 common eventfd2 sys_eventfd2 sys_eventfd2 324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 -325 common pipe2 sys_pipe2 compat_sys_pipe2 +325 common pipe2 sys_pipe2 sys_pipe2 326 common dup3 sys_dup3 sys_dup3 327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 328 common preadv sys_preadv compat_sys_preadv 329 common pwritev sys_pwritev compat_sys_pwritev 330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo -331 common perf_event_open sys_perf_event_open compat_sys_perf_event_open +331 common perf_event_open sys_perf_event_open sys_perf_event_open 332 common fanotify_init sys_fanotify_init sys_fanotify_init 333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark -334 common prlimit64 sys_prlimit64 compat_sys_prlimit64 -335 common name_to_handle_at sys_name_to_handle_at compat_sys_name_to_handle_at +334 common prlimit64 sys_prlimit64 sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at sys_name_to_handle_at 336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at -337 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +337 common clock_adjtime sys_clock_adjtime sys_clock_adjtime32 338 common syncfs sys_syncfs sys_syncfs 339 common setns sys_setns sys_setns 340 common process_vm_readv sys_process_vm_readv sys_process_vm_readv 341 common process_vm_writev sys_process_vm_writev sys_process_vm_writev 342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr -343 common kcmp sys_kcmp compat_sys_kcmp -344 common finit_module sys_finit_module compat_sys_finit_module -345 common sched_setattr sys_sched_setattr compat_sys_sched_setattr -346 common sched_getattr sys_sched_getattr compat_sys_sched_getattr -347 common renameat2 sys_renameat2 compat_sys_renameat2 -348 common seccomp sys_seccomp compat_sys_seccomp -349 common getrandom sys_getrandom compat_sys_getrandom -350 common memfd_create sys_memfd_create compat_sys_memfd_create -351 common bpf sys_bpf compat_sys_bpf -352 common s390_pci_mmio_write sys_s390_pci_mmio_write compat_sys_s390_pci_mmio_write -353 common s390_pci_mmio_read sys_s390_pci_mmio_read compat_sys_s390_pci_mmio_read +343 common kcmp sys_kcmp sys_kcmp +344 common finit_module sys_finit_module sys_finit_module +345 common sched_setattr sys_sched_setattr sys_sched_setattr +346 common sched_getattr sys_sched_getattr sys_sched_getattr +347 common renameat2 sys_renameat2 sys_renameat2 +348 common seccomp sys_seccomp sys_seccomp +349 common getrandom sys_getrandom sys_getrandom +350 common memfd_create sys_memfd_create sys_memfd_create +351 common bpf sys_bpf sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read sys_s390_pci_mmio_read 354 common execveat sys_execveat compat_sys_execveat 355 common userfaultfd sys_userfaultfd sys_userfaultfd 356 common membarrier sys_membarrier sys_membarrier -357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg_time32 358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg 359 common socket sys_socket sys_socket -360 common socketpair sys_socketpair compat_sys_socketpair -361 common bind sys_bind compat_sys_bind -362 common connect sys_connect compat_sys_connect +360 common socketpair sys_socketpair sys_socketpair +361 common bind sys_bind sys_bind +362 common connect sys_connect sys_connect 363 common listen sys_listen sys_listen -364 common accept4 sys_accept4 compat_sys_accept4 +364 common accept4 sys_accept4 sys_accept4 365 common getsockopt sys_getsockopt sys_getsockopt 366 common setsockopt sys_setsockopt sys_setsockopt -367 common getsockname sys_getsockname compat_sys_getsockname -368 common getpeername sys_getpeername compat_sys_getpeername -369 common sendto sys_sendto compat_sys_sendto +367 common getsockname sys_getsockname sys_getsockname +368 common getpeername sys_getpeername sys_getpeername +369 common sendto sys_sendto sys_sendto 370 common sendmsg sys_sendmsg compat_sys_sendmsg 371 common recvfrom sys_recvfrom compat_sys_recvfrom 372 common recvmsg sys_recvmsg compat_sys_recvmsg 373 common shutdown sys_shutdown sys_shutdown -374 common mlock2 sys_mlock2 compat_sys_mlock2 -375 common copy_file_range sys_copy_file_range compat_sys_copy_file_range +374 common mlock2 sys_mlock2 sys_mlock2 +375 common copy_file_range sys_copy_file_range sys_copy_file_range 376 common preadv2 sys_preadv2 compat_sys_preadv2 377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 -378 common s390_guarded_storage sys_s390_guarded_storage compat_sys_s390_guarded_storage -379 common statx sys_statx compat_sys_statx -380 common s390_sthyi sys_s390_sthyi compat_sys_s390_sthyi +378 common s390_guarded_storage sys_s390_guarded_storage sys_s390_guarded_storage +379 common statx sys_statx sys_statx +380 common s390_sthyi sys_s390_sthyi sys_s390_sthyi +381 common kexec_file_load sys_kexec_file_load sys_kexec_file_load +382 common io_pgetevents sys_io_pgetevents compat_sys_io_pgetevents +383 common rseq sys_rseq sys_rseq +384 common pkey_mprotect sys_pkey_mprotect sys_pkey_mprotect +385 common pkey_alloc sys_pkey_alloc sys_pkey_alloc +386 common pkey_free sys_pkey_free sys_pkey_free +# room for arch specific syscalls +392 64 semtimedop sys_semtimedop - +393 common semget sys_semget sys_semget +394 common semctl sys_semctl compat_sys_semctl +395 common shmget sys_shmget sys_shmget +396 common shmctl sys_shmctl compat_sys_shmctl +397 common shmat sys_shmat compat_sys_shmat +398 common shmdt sys_shmdt sys_shmdt +399 common msgget sys_msgget sys_msgget +400 common msgsnd sys_msgsnd compat_sys_msgsnd +401 common msgrcv sys_msgrcv compat_sys_msgrcv +402 common msgctl sys_msgctl compat_sys_msgctl +403 32 clock_gettime64 - sys_clock_gettime +404 32 clock_settime64 - sys_clock_settime +405 32 clock_adjtime64 - sys_clock_adjtime +406 32 clock_getres_time64 - sys_clock_getres +407 32 clock_nanosleep_time64 - sys_clock_nanosleep +408 32 timer_gettime64 - sys_timer_gettime +409 32 timer_settime64 - sys_timer_settime +410 32 timerfd_gettime64 - sys_timerfd_gettime +411 32 timerfd_settime64 - sys_timerfd_settime +412 32 utimensat_time64 - sys_utimensat +413 32 pselect6_time64 - compat_sys_pselect6_time64 +414 32 ppoll_time64 - compat_sys_ppoll_time64 +416 32 io_pgetevents_time64 - sys_io_pgetevents +417 32 recvmmsg_time64 - compat_sys_recvmmsg_time64 +418 32 mq_timedsend_time64 - sys_mq_timedsend +419 32 mq_timedreceive_time64 - sys_mq_timedreceive +420 32 semtimedop_time64 - sys_semtimedop +421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 +422 32 futex_time64 - sys_futex +423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register sys_io_uring_register +428 common open_tree sys_open_tree sys_open_tree +429 common move_mount sys_move_mount sys_move_mount +430 common fsopen sys_fsopen sys_fsopen +431 common fsconfig sys_fsconfig sys_fsconfig +432 common fsmount sys_fsmount sys_fsmount +433 common fspick sys_fspick sys_fspick +434 common pidfd_open sys_pidfd_open sys_pidfd_open +435 common clone3 sys_clone3 sys_clone3 +436 common close_range sys_close_range sys_close_range +437 common openat2 sys_openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd +439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 From 23331eeb731a503aaa74d167055eeedc2073ff09 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:17 +0200 Subject: [PATCH 306/326] perf tests: Improve topology test to check all aggregation types Improve the topology test to check all aggregation types. This is to lock down the behaviour before 'id' is changed into a struct in later commits. Committer testing: $ perf test topology 41: Session topology: Ok $ $ perf test -v topology 41: Session topology: --- start --- test child forked, pid 965552 templ file: /tmp/perf-test-mO7NtI Problems creating module maps, continuing anyway... CPU 0, core 0, socket 0 CPU 1, core 1, socket 0 CPU 2, core 2, socket 0 CPU 3, core 4, socket 0 CPU 4, core 5, socket 0 CPU 5, core 6, socket 0 CPU 6, core 8, socket 0 CPU 7, core 9, socket 0 CPU 8, core 10, socket 0 CPU 9, core 12, socket 0 CPU 10, core 13, socket 0 CPU 11, core 14, socket 0 CPU 12, core 0, socket 0 CPU 13, core 1, socket 0 CPU 14, core 2, socket 0 CPU 15, core 4, socket 0 CPU 16, core 5, socket 0 CPU 17, core 6, socket 0 CPU 18, core 8, socket 0 CPU 19, core 9, socket 0 CPU 20, core 10, socket 0 CPU 21, core 12, socket 0 CPU 22, core 13, socket 0 CPU 23, core 14, socket 0 test child finished with 0 ---- end ---- Session topology: Ok $ Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-2-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/topology.c | 53 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 165feedc7863..f988b3109afc 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -64,10 +64,11 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) .path = path, .mode = PERF_DATA_MODE_READ, }; - int i; + int i, id; session = perf_session__new(&data, false, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); + cpu__setup_cpunode_map(); /* On platforms with large numbers of CPUs process_cpu_topology() * might issue an error while reading the perf.data file section @@ -85,11 +86,18 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) * "socket_id number is too big. You may need to upgrade the * perf tool." * - * This is the reason why this test might be skipped. + * This is the reason why this test might be skipped. aarch64 and + * s390 always write this part of the header, even when the above + * condition is true (see do_core_id_test in header.c). So always + * run this test on those platforms. */ - if (!session->header.env.cpu) + if (!session->header.env.cpu + && strncmp(session->header.env.arch, "s390", 4) + && strncmp(session->header.env.arch, "aarch64", 7)) return TEST_SKIP; + TEST_ASSERT_VAL("Session header CPU map not set", session->header.env.cpu); + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { if (!cpu_map__has(map, i)) continue; @@ -98,14 +106,45 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[i].socket_id); } + // Test that core ID contains socket, die and core for (i = 0; i < map->nr; i++) { - TEST_ASSERT_VAL("Core ID doesn't match", - (session->header.env.cpu[map->map[i]].core_id == (cpu_map__get_core(map, i, NULL) & 0xffff))); + id = cpu_map__get_core(map, i, NULL); + TEST_ASSERT_VAL("Core map - Core ID doesn't match", + session->header.env.cpu[map->map[i]].core_id == cpu_map__id_to_cpu(id)); - TEST_ASSERT_VAL("Socket ID doesn't match", - (session->header.env.cpu[map->map[i]].socket_id == cpu_map__get_socket(map, i, NULL))); + TEST_ASSERT_VAL("Core map - Socket ID doesn't match", + session->header.env.cpu[map->map[i]].socket_id == + cpu_map__id_to_socket(id)); + + TEST_ASSERT_VAL("Core map - Die ID doesn't match", + session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id)); } + // Test that die ID contains socket and die + for (i = 0; i < map->nr; i++) { + id = cpu_map__get_die(map, i, NULL); + TEST_ASSERT_VAL("Die map - Socket ID doesn't match", + session->header.env.cpu[map->map[i]].socket_id == + cpu_map__id_to_socket(id << 16)); + + TEST_ASSERT_VAL("Die map - Die ID doesn't match", + session->header.env.cpu[map->map[i]].die_id == + cpu_map__id_to_die(id << 16)); + } + + // Test that socket ID contains only socket + for (i = 0; i < map->nr; i++) { + id = cpu_map__get_socket(map, i, NULL); + TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", + session->header.env.cpu[map->map[i]].socket_id == id); + } + + // Test that node ID contains only node + for (i = 0; i < map->nr; i++) { + id = cpu_map__get_node(map, i, NULL); + TEST_ASSERT_VAL("Node map - Node ID doesn't match", + cpu__get_node(map->map[i]) == id); + } perf_session__delete(session); return 0; From 91585846f105ef2e3f479a5124a264ebb770f6ab Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:18 +0200 Subject: [PATCH 307/326] perf cpumap: Use existing allocator to avoid using malloc Use the existing allocator for perf_cpu_map to avoid use of raw malloc. This could cause an issue in later commits where the size of perf_cpu_map is changed. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-3-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index dc5c5e6fc502..20e3a75953fc 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -132,15 +132,16 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, int (*f)(struct perf_cpu_map *map, int cpu, void *data), void *data) { - struct perf_cpu_map *c; int nr = cpus->nr; + struct perf_cpu_map *c = perf_cpu_map__empty_new(nr); int cpu, s1, s2; - /* allocate as much as possible */ - c = calloc(1, sizeof(*c) + nr * sizeof(int)); if (!c) return -1; + /* Reset size as it may only be partially filled */ + c->nr = 0; + for (cpu = 0; cpu < nr; cpu++) { s1 = f(cpus, cpu, data); for (s2 = 0; s2 < c->nr; s2++) { @@ -155,7 +156,6 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, /* ensure we process id in increasing order */ qsort(c->map, c->nr, sizeof(int), cmp_ids); - refcount_set(&c->refcnt, 1); *res = c; return 0; } From fa265e59b81a09fa3d88f3322b1e44d583cac9b0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:19 +0200 Subject: [PATCH 308/326] perf cpumap: Add new struct for cpu aggregation This struct currently has only a single int member so that it can be used as a drop in replacement for the existing behaviour. Comparison and constructor functions have also been added that will replace usages of '==' and '= -1'. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-4-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 18 ++++++++++++++++++ tools/perf/util/cpumap.h | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 20e3a75953fc..8624948b4f1d 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -586,3 +586,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ return online; } + +bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) +{ + return a.id == b.id; +} + +bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) +{ + return a.id == -1; +} + +struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) +{ + struct aggr_cpu_id ret = { + .id = -1 + }; + return ret; +} diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 3a442f021468..1cdccc69cd4b 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -7,6 +7,10 @@ #include #include +struct aggr_cpu_id { + int id; +}; + struct perf_record_cpu_map_data; struct perf_cpu_map *perf_cpu_map__empty_new(int nr); @@ -64,4 +68,8 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, int cpu_map__cpu(struct perf_cpu_map *cpus, int idx); bool cpu_map__has(struct perf_cpu_map *cpus, int cpu); +bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b); +bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a); +struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void); + #endif /* __PERF_CPUMAP_H */ From 2760f5a14fe7aa466e38bbb92d0284fffc0e4da0 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:20 +0200 Subject: [PATCH 309/326] perf stat: Replace aggregation ID with a struct Replace all occurences of the usage of int with the new struct cpu_aggr_id. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-5-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 76 +++++++++++++---------- tools/perf/tests/topology.c | 17 +++--- tools/perf/util/cpumap.c | 84 ++++++++++++++----------- tools/perf/util/cpumap.h | 10 +-- tools/perf/util/stat-display.c | 108 +++++++++++++++++++-------------- tools/perf/util/stat.c | 2 +- tools/perf/util/stat.h | 5 +- 7 files changed, 174 insertions(+), 128 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 89c32692f40c..1ff76d36d1c9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1186,65 +1186,67 @@ static struct option stat_options[] = { OPT_END() }; -static int perf_stat__get_socket(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_socket(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int cpu) { return cpu_map__get_socket(map, cpu, NULL); } -static int perf_stat__get_die(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int cpu) { return cpu_map__get_die(map, cpu, NULL); } -static int perf_stat__get_core(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int cpu) { return cpu_map__get_core(map, cpu, NULL); } -static int perf_stat__get_node(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_node(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int cpu) { return cpu_map__get_node(map, cpu, NULL); } -static int perf_stat__get_aggr(struct perf_stat_config *config, +static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config, aggr_get_id_t get_id, struct perf_cpu_map *map, int idx) { int cpu; + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (idx >= map->nr) - return -1; + return id; cpu = map->map[idx]; if (config->cpus_aggr_map->map[cpu] == -1) - config->cpus_aggr_map->map[cpu] = get_id(config, map, idx); + config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id; - return config->cpus_aggr_map->map[cpu]; + id.id = config->cpus_aggr_map->map[cpu]; + return id; } -static int perf_stat__get_socket_cached(struct perf_stat_config *config, +static struct aggr_cpu_id perf_stat__get_socket_cached(struct perf_stat_config *config, struct perf_cpu_map *map, int idx) { return perf_stat__get_aggr(config, perf_stat__get_socket, map, idx); } -static int perf_stat__get_die_cached(struct perf_stat_config *config, +static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *config, struct perf_cpu_map *map, int idx) { return perf_stat__get_aggr(config, perf_stat__get_die, map, idx); } -static int perf_stat__get_core_cached(struct perf_stat_config *config, +static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config, struct perf_cpu_map *map, int idx) { return perf_stat__get_aggr(config, perf_stat__get_core, map, idx); } -static int perf_stat__get_node_cached(struct perf_stat_config *config, +static struct aggr_cpu_id perf_stat__get_node_cached(struct perf_stat_config *config, struct perf_cpu_map *map, int idx) { return perf_stat__get_aggr(config, perf_stat__get_node, map, idx); @@ -1345,18 +1347,23 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct perf_cpu_map *m return cpu; } -static int perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data) +static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx, void *data) { struct perf_env *env = data; int cpu = perf_env__get_cpu(env, map, idx); + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); - return cpu == -1 ? -1 : env->cpu[cpu].socket_id; + if (cpu != -1) + id.id = env->cpu[cpu].socket_id; + + return id; } -static int perf_env__get_die(struct perf_cpu_map *map, int idx, void *data) +static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, void *data) { struct perf_env *env = data; - int die_id = -1, cpu = perf_env__get_cpu(env, map, idx); + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); + int cpu = perf_env__get_cpu(env, map, idx); if (cpu != -1) { /* @@ -1366,21 +1373,22 @@ static int perf_env__get_die(struct perf_cpu_map *map, int idx, void *data) * socket + die id */ if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); - die_id = (env->cpu[cpu].socket_id << 8) | (env->cpu[cpu].die_id & 0xff); + id.id = (env->cpu[cpu].socket_id << 8) | (env->cpu[cpu].die_id & 0xff); } - return die_id; + return id; } -static int perf_env__get_core(struct perf_cpu_map *map, int idx, void *data) +static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, void *data) { struct perf_env *env = data; - int core = -1, cpu = perf_env__get_cpu(env, map, idx); + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); + int cpu = perf_env__get_cpu(env, map, idx); if (cpu != -1) { /* @@ -1391,27 +1399,29 @@ static int perf_env__get_core(struct perf_cpu_map *map, int idx, void *data) * socket + die id + core id */ if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); - core = (env->cpu[cpu].socket_id << 24) | + id.id = (env->cpu[cpu].socket_id << 24) | (env->cpu[cpu].die_id << 16) | (env->cpu[cpu].core_id & 0xffff); } - return core; + return id; } -static int perf_env__get_node(struct perf_cpu_map *map, int idx, void *data) +static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, void *data) { int cpu = perf_env__get_cpu(data, map, idx); + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); - return perf_env__numa_node(data, cpu); + id.id = perf_env__numa_node(data, cpu); + return id; } static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus, @@ -1438,24 +1448,24 @@ static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map *c return cpu_map__build_map(cpus, nodep, perf_env__get_node, env); } -static int perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_socket_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int idx) { return perf_env__get_socket(map, idx, &perf_stat.session->header.env); } -static int perf_stat__get_die_file(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int idx) { return perf_env__get_die(map, idx, &perf_stat.session->header.env); } -static int perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int idx) { return perf_env__get_core(map, idx, &perf_stat.session->header.env); } -static int perf_stat__get_node_file(struct perf_stat_config *config __maybe_unused, +static struct aggr_cpu_id perf_stat__get_node_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu_map *map, int idx) { return perf_env__get_node(map, idx, &perf_stat.session->header.env); diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index f988b3109afc..7a7d16b3950a 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -64,7 +64,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) .path = path, .mode = PERF_DATA_MODE_READ, }; - int i, id; + int i; + struct aggr_cpu_id id; session = perf_session__new(&data, false, NULL); TEST_ASSERT_VAL("can't get session", !IS_ERR(session)); @@ -110,14 +111,14 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) for (i = 0; i < map->nr; i++) { id = cpu_map__get_core(map, i, NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[map->map[i]].core_id == cpu_map__id_to_cpu(id)); + session->header.env.cpu[map->map[i]].core_id == cpu_map__id_to_cpu(id.id)); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", session->header.env.cpu[map->map[i]].socket_id == - cpu_map__id_to_socket(id)); + cpu_map__id_to_socket(id.id)); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id)); + session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id.id)); } // Test that die ID contains socket and die @@ -125,25 +126,25 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) id = cpu_map__get_die(map, i, NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", session->header.env.cpu[map->map[i]].socket_id == - cpu_map__id_to_socket(id << 16)); + cpu_map__id_to_socket(id.id << 16)); TEST_ASSERT_VAL("Die map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == - cpu_map__id_to_die(id << 16)); + cpu_map__id_to_die(id.id << 16)); } // Test that socket ID contains only socket for (i = 0; i < map->nr; i++) { id = cpu_map__get_socket(map, i, NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[map->map[i]].socket_id == id); + session->header.env.cpu[map->map[i]].socket_id == id.id); } // Test that node ID contains only node for (i = 0; i < map->nr; i++) { id = cpu_map__get_node(map, i, NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(map->map[i]) == id); + cpu__get_node(map->map[i]) == id.id); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 8624948b4f1d..e05a12bde073 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -111,30 +111,37 @@ int cpu_map__get_socket_id(int cpu) return ret ?: value; } -int cpu_map__get_socket(struct perf_cpu_map *map, int idx, void *data __maybe_unused) +struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx, + void *data __maybe_unused) { int cpu; + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (idx > map->nr) - return -1; + return id; cpu = map->map[idx]; - return cpu_map__get_socket_id(cpu); + id.id = cpu_map__get_socket_id(cpu); + return id; } -static int cmp_ids(const void *a, const void *b) +static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) { - return *(int *)a - *(int *)b; + struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer; + struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer; + + return a->id - b->id; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, - int (*f)(struct perf_cpu_map *map, int cpu, void *data), + struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data), void *data) { int nr = cpus->nr; struct perf_cpu_map *c = perf_cpu_map__empty_new(nr); - int cpu, s1, s2; + int cpu, s2; + struct aggr_cpu_id s1; if (!c) return -1; @@ -145,16 +152,16 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, for (cpu = 0; cpu < nr; cpu++) { s1 = f(cpus, cpu, data); for (s2 = 0; s2 < c->nr; s2++) { - if (s1 == c->map[s2]) + if (s1.id == c->map[s2]) break; } if (s2 == c->nr) { - c->map[c->nr] = s1; + c->map[c->nr] = s1.id; c->nr++; } } /* ensure we process id in increasing order */ - qsort(c->map, c->nr, sizeof(int), cmp_ids); + qsort(c->map, c->nr, sizeof(struct aggr_cpu_id), cmp_aggr_cpu_id); *res = c; return 0; @@ -167,23 +174,24 @@ int cpu_map__get_die_id(int cpu) return ret ?: value; } -int cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data) +struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data) { - int cpu, die_id, s; + int cpu, s; + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (idx > map->nr) - return -1; + return id; cpu = map->map[idx]; - die_id = cpu_map__get_die_id(cpu); + id.id = cpu_map__get_die_id(cpu); /* There is no die_id on legacy system. */ - if (die_id == -1) - die_id = 0; + if (id.id == -1) + id.id = 0; - s = cpu_map__get_socket(map, idx, data); + s = cpu_map__get_socket(map, idx, data).id; if (s == -1) - return -1; + return cpu_map__empty_aggr_cpu_id(); /* * Encode socket in bit range 15:8 @@ -191,13 +199,14 @@ int cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data) * we need a global id. So we combine * socket + die id */ - if (WARN_ONCE(die_id >> 8, "The die id number is too big.\n")) - return -1; + if (WARN_ONCE(id.id >> 8, "The die id number is too big.\n")) + return cpu_map__empty_aggr_cpu_id(); if (WARN_ONCE(s >> 8, "The socket id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); - return (s << 8) | (die_id & 0xff); + id.id = (s << 8) | (id.id & 0xff); + return id; } int cpu_map__get_core_id(int cpu) @@ -211,21 +220,22 @@ int cpu_map__get_node_id(int cpu) return cpu__get_node(cpu); } -int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data) +struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data) { - int cpu, s_die; + int cpu; + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (idx > map->nr) - return -1; + return id; cpu = map->map[idx]; cpu = cpu_map__get_core_id(cpu); - /* s_die is the combination of socket + die id */ - s_die = cpu_map__get_die(map, idx, data); - if (s_die == -1) - return -1; + /* cpu_map__get_die returns the combination of socket + die id */ + id = cpu_map__get_die(map, idx, data); + if (cpu_map__aggr_cpu_id_is_empty(id)) + return id; /* * encode socket in bit range 31:24 @@ -235,17 +245,21 @@ int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data) * socket + die id + core id */ if (WARN_ONCE(cpu >> 16, "The core id number is too big.\n")) - return -1; + return cpu_map__empty_aggr_cpu_id(); - return (s_die << 16) | (cpu & 0xffff); + id.id = (id.id << 16) | (cpu & 0xffff); + return id; } -int cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data __maybe_unused) +struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data __maybe_unused) { - if (idx < 0 || idx >= map->nr) - return -1; + struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); - return cpu_map__get_node_id(map->map[idx]); + if (idx < 0 || idx >= map->nr) + return id; + + id.id = cpu_map__get_node_id(map->map[idx]); + return id; } int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 1cdccc69cd4b..b8c2288a3f6d 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -19,13 +19,13 @@ size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__fprintf(struct perf_cpu_map *map, FILE *fp); int cpu_map__get_socket_id(int cpu); -int cpu_map__get_socket(struct perf_cpu_map *map, int idx, void *data); +struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx, void *data); int cpu_map__get_die_id(int cpu); -int cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data); +struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data); int cpu_map__get_core_id(int cpu); -int cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data); +struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data); int cpu_map__get_node_id(int cpu); -int cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data); +struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data); int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp); int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map **diep); int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map **corep); @@ -62,7 +62,7 @@ int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, - int (*f)(struct perf_cpu_map *map, int cpu, void *data), + struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data), void *data); int cpu_map__cpu(struct perf_cpu_map *cpus, int idx); diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index fee7543843a8..8f4ee3621863 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -68,15 +68,15 @@ static void print_cgroup(struct perf_stat_config *config, struct evsel *evsel) static void aggr_printout(struct perf_stat_config *config, - struct evsel *evsel, int id, int nr) + struct evsel *evsel, struct aggr_cpu_id id, int nr) { switch (config->aggr_mode) { case AGGR_CORE: fprintf(config->output, "S%d-D%d-C%*d%s%*d%s", - cpu_map__id_to_socket(id), - cpu_map__id_to_die(id), + cpu_map__id_to_socket(id.id), + cpu_map__id_to_die(id.id), config->csv_output ? 0 : -8, - cpu_map__id_to_cpu(id), + cpu_map__id_to_cpu(id.id), config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -84,9 +84,9 @@ static void aggr_printout(struct perf_stat_config *config, break; case AGGR_DIE: fprintf(config->output, "S%d-D%*d%s%*d%s", - cpu_map__id_to_socket(id << 16), + cpu_map__id_to_socket(id.id << 16), config->csv_output ? 0 : -8, - cpu_map__id_to_die(id << 16), + cpu_map__id_to_die(id.id << 16), config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -95,7 +95,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_SOCKET: fprintf(config->output, "S%*d%s%*d%s", config->csv_output ? 0 : -5, - id, + id.id, config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -104,7 +104,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_NODE: fprintf(config->output, "N%*d%s%*d%s", config->csv_output ? 0 : -5, - id, + id.id, config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -113,23 +113,23 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_NONE: if (evsel->percore && !config->percore_show_thread) { fprintf(config->output, "S%d-D%d-C%*d%s", - cpu_map__id_to_socket(id), - cpu_map__id_to_die(id), + cpu_map__id_to_socket(id.id), + cpu_map__id_to_die(id.id), config->csv_output ? 0 : -3, - cpu_map__id_to_cpu(id), config->csv_sep); - } else if (id > -1) { + cpu_map__id_to_cpu(id.id), config->csv_sep); + } else if (id.id > -1) { fprintf(config->output, "CPU%*d%s", config->csv_output ? 0 : -7, - evsel__cpus(evsel)->map[id], + evsel__cpus(evsel)->map[id.id], config->csv_sep); } break; case AGGR_THREAD: fprintf(config->output, "%*s-%*d%s", config->csv_output ? 0 : 16, - perf_thread_map__comm(evsel->core.threads, id), + perf_thread_map__comm(evsel->core.threads, id.id), config->csv_output ? 0 : -8, - perf_thread_map__pid(evsel->core.threads, id), + perf_thread_map__pid(evsel->core.threads, id.id), config->csv_sep); break; case AGGR_GLOBAL: @@ -144,7 +144,8 @@ struct outstate { bool newline; const char *prefix; int nfields; - int id, nr; + int nr; + struct aggr_cpu_id id; struct evsel *evsel; }; @@ -319,13 +320,13 @@ static void print_metric_header(struct perf_stat_config *config, } static int first_shadow_cpu(struct perf_stat_config *config, - struct evsel *evsel, int id) + struct evsel *evsel, struct aggr_cpu_id id) { struct evlist *evlist = evsel->evlist; int i; if (config->aggr_mode == AGGR_NONE) - return id; + return id.id; if (!config->aggr_get_id) return 0; @@ -333,14 +334,17 @@ static int first_shadow_cpu(struct perf_stat_config *config, for (i = 0; i < evsel__nr_cpus(evsel); i++) { int cpu2 = evsel__cpus(evsel)->map[i]; - if (config->aggr_get_id(config, evlist->core.cpus, cpu2) == id) + if (cpu_map__compare_aggr_cpu_id( + config->aggr_get_id(config, evlist->core.cpus, cpu2), + id)) { return cpu2; + } } return 0; } static void abs_printout(struct perf_stat_config *config, - int id, int nr, struct evsel *evsel, double avg) + struct aggr_cpu_id id, int nr, struct evsel *evsel, double avg) { FILE *output = config->output; double sc = evsel->scale; @@ -393,7 +397,7 @@ static bool is_mixed_hw_group(struct evsel *counter) return false; } -static void printout(struct perf_stat_config *config, int id, int nr, +static void printout(struct perf_stat_config *config, struct aggr_cpu_id id, int nr, struct evsel *counter, double uval, char *prefix, u64 run, u64 ena, double noise, struct runtime_stat *st) @@ -496,17 +500,18 @@ static void printout(struct perf_stat_config *config, int id, int nr, static void aggr_update_shadow(struct perf_stat_config *config, struct evlist *evlist) { - int cpu, s2, id, s; + int cpu, s; + struct aggr_cpu_id s2, id; u64 val; struct evsel *counter; for (s = 0; s < config->aggr_map->nr; s++) { - id = config->aggr_map->map[s]; + id.id = config->aggr_map->map[s]; evlist__for_each_entry(evlist, counter) { val = 0; for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { s2 = config->aggr_get_id(config, evlist->core.cpus, cpu); - if (s2 != id) + if (!cpu_map__compare_aggr_cpu_id(s2, id)) continue; val += perf_counts(counter->counts, cpu, 0)->val; } @@ -584,7 +589,7 @@ static bool collect_data(struct perf_stat_config *config, struct evsel *counter, struct aggr_data { u64 ena, run, val; - int id; + struct aggr_cpu_id id; int nr; int cpu; }; @@ -593,13 +598,14 @@ static void aggr_cb(struct perf_stat_config *config, struct evsel *counter, void *data, bool first) { struct aggr_data *ad = data; - int cpu, s2; + int cpu; + struct aggr_cpu_id s2; for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { struct perf_counts_values *counts; s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu); - if (s2 != ad->id) + if (!cpu_map__compare_aggr_cpu_id(s2, ad->id)) continue; if (first) ad->nr++; @@ -628,10 +634,11 @@ static void print_counter_aggrdata(struct perf_stat_config *config, struct aggr_data ad; FILE *output = config->output; u64 ena, run, val; - int id, nr; + int nr; + struct aggr_cpu_id id; double uval; - ad.id = id = config->aggr_map->map[s]; + ad.id.id = id.id = config->aggr_map->map[s]; ad.val = ad.ena = ad.run = 0; ad.nr = 0; if (!collect_data(config, counter, aggr_cb, &ad)) @@ -649,8 +656,12 @@ static void print_counter_aggrdata(struct perf_stat_config *config, fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(config, cpu != -1 ? cpu : id, nr, counter, uval, prefix, - run, ena, 1.0, &rt_stat); + if (cpu != -1) { + id = cpu_map__empty_aggr_cpu_id(); + id.id = cpu; + } + printout(config, id, nr, counter, uval, + prefix, run, ena, 1.0, &rt_stat); if (!metric_only) fputc('\n', output); } @@ -728,7 +739,8 @@ static struct perf_aggr_thread_value *sort_aggr_thread( continue; buf[i].counter = counter; - buf[i].id = thread; + buf[i].id = cpu_map__empty_aggr_cpu_id(); + buf[i].id.id = thread; buf[i].uval = uval; buf[i].val = val; buf[i].run = run; @@ -751,7 +763,8 @@ static void print_aggr_thread(struct perf_stat_config *config, FILE *output = config->output; int nthreads = perf_thread_map__nr(counter->core.threads); int ncpus = perf_cpu_map__nr(counter->core.cpus); - int thread, sorted_threads, id; + int thread, sorted_threads; + struct aggr_cpu_id id; struct perf_aggr_thread_value *buf; buf = sort_aggr_thread(counter, nthreads, ncpus, &sorted_threads, _target); @@ -768,7 +781,7 @@ static void print_aggr_thread(struct perf_stat_config *config, if (config->stats) printout(config, id, 0, buf[thread].counter, buf[thread].uval, prefix, buf[thread].run, buf[thread].ena, 1.0, - &config->stats[id]); + &config->stats[id.id]); else printout(config, id, 0, buf[thread].counter, buf[thread].uval, prefix, buf[thread].run, buf[thread].ena, 1.0, @@ -814,8 +827,8 @@ static void print_counter_aggr(struct perf_stat_config *config, fprintf(output, "%s", prefix); uval = cd.avg * counter->scale; - printout(config, -1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, - cd.avg, &rt_stat); + printout(config, cpu_map__empty_aggr_cpu_id(), 0, counter, uval, prefix, cd.avg_running, + cd.avg_enabled, cd.avg, &rt_stat); if (!metric_only) fprintf(output, "\n"); } @@ -842,6 +855,7 @@ static void print_counter(struct perf_stat_config *config, u64 ena, run, val; double uval; int cpu; + struct aggr_cpu_id id; for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { struct aggr_data ad = { .cpu = cpu }; @@ -856,8 +870,10 @@ static void print_counter(struct perf_stat_config *config, fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0, - &rt_stat); + id = cpu_map__empty_aggr_cpu_id(); + id.id = cpu; + printout(config, id, 0, counter, uval, prefix, + run, ena, 1.0, &rt_stat); fputc('\n', output); } @@ -872,6 +888,7 @@ static void print_no_aggr_metric(struct perf_stat_config *config, struct evsel *counter; u64 ena, run, val; double uval; + struct aggr_cpu_id id; nrcpus = evlist->core.cpus->nr; for (cpu = 0; cpu < nrcpus; cpu++) { @@ -880,8 +897,10 @@ static void print_no_aggr_metric(struct perf_stat_config *config, if (prefix) fputs(prefix, config->output); evlist__for_each_entry(evlist, counter) { + id = cpu_map__empty_aggr_cpu_id(); + id.id = cpu; if (first) { - aggr_printout(config, counter, cpu, 0); + aggr_printout(config, counter, id, 0); first = false; } val = perf_counts(counter->counts, cpu, 0)->val; @@ -889,8 +908,8 @@ static void print_no_aggr_metric(struct perf_stat_config *config, run = perf_counts(counter->counts, cpu, 0)->run; uval = val * counter->scale; - printout(config, cpu, 0, counter, uval, prefix, run, ena, 1.0, - &rt_stat); + printout(config, id, 0, counter, uval, prefix, + run, ena, 1.0, &rt_stat); } fputc('\n', config->output); } @@ -1140,14 +1159,15 @@ static void print_footer(struct perf_stat_config *config) static void print_percore_thread(struct perf_stat_config *config, struct evsel *counter, char *prefix) { - int s, s2, id; + int s; + struct aggr_cpu_id s2, id; bool first = true; for (int i = 0; i < evsel__nr_cpus(counter); i++) { s2 = config->aggr_get_id(config, evsel__cpus(counter), i); for (s = 0; s < config->aggr_map->nr; s++) { - id = config->aggr_map->map[s]; - if (s2 == id) + id.id = config->aggr_map->map[s]; + if (cpu_map__compare_aggr_cpu_id(s2, id)) break; } diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 1e125e39ff84..1d5c9f98396b 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -313,7 +313,7 @@ static int check_per_pkg(struct evsel *counter, if (!(vals->run && vals->ena)) return 0; - s = cpu_map__get_socket(cpus, cpu, NULL); + s = cpu_map__get_socket(cpus, cpu, NULL).id; if (s < 0) return -1; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 9979b4b100f2..87522ffb2db2 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -6,6 +6,7 @@ #include #include #include +#include "cpumap.h" #include "rblist.h" struct perf_cpu_map; @@ -99,7 +100,7 @@ struct runtime_stat { struct rblist value_list; }; -typedef int (*aggr_get_id_t)(struct perf_stat_config *config, +typedef struct aggr_cpu_id (*aggr_get_id_t)(struct perf_stat_config *config, struct perf_cpu_map *m, int cpu); struct perf_stat_config { @@ -170,7 +171,7 @@ struct evlist; struct perf_aggr_thread_value { struct evsel *counter; - int id; + struct aggr_cpu_id id; double uval; u64 val; u64 run; From cea6575fdccfc0624ca42f656e16e6b4d9bb48a5 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:21 +0200 Subject: [PATCH 310/326] perf cpumap: Add new map type for aggregation Currently this is a duplicate of perf_cpu_map so that it can be used as a drop in replacement. In a later commit it will be changed from a map of ints to use the new cpu_aggr_id struct. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-6-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 17 +++++++++++++++++ tools/perf/util/cpumap.h | 8 ++++++++ 2 files changed, 25 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index e05a12bde073..b18e53506656 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -95,6 +95,23 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr) return cpus; } +struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr) +{ + struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr); + + if (cpus != NULL) { + int i; + + cpus->nr = nr; + for (i = 0; i < nr; i++) + cpus->map[i] = -1; + + refcount_set(&cpus->refcnt, 1); + } + + return cpus; +} + static int cpu__get_topology_int(int cpu, const char *name, int *value) { char path[PATH_MAX]; diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index b8c2288a3f6d..ebd65c4f431b 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -11,9 +11,17 @@ struct aggr_cpu_id { int id; }; +struct cpu_aggr_map { + refcount_t refcnt; + int nr; + int map[]; +}; + struct perf_record_cpu_map_data; struct perf_cpu_map *perf_cpu_map__empty_new(int nr); +struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr); + struct perf_cpu_map *cpu_map__new_data(struct perf_record_cpu_map_data *data); size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size); size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size); From d526e1a033e03ec4515b1800f99d99a35c7ea790 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:22 +0200 Subject: [PATCH 311/326] perf cpumap: Drop in cpu_aggr_map struct Replace usages of perf_cpu_map with cpu_aggr map in places that are involved with 'perf stat' aggregation. This will then later be changed to be a map of cpu_aggr_id rather than an int so that more data can be stored. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: John Garry Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-7-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 29 ++++++++++++++++++++++------- tools/perf/util/cpumap.c | 12 ++++++------ tools/perf/util/cpumap.h | 10 +++++----- tools/perf/util/stat.h | 4 ++-- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 1ff76d36d1c9..cc017c7dcee9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1320,14 +1320,29 @@ static int perf_stat_init_aggr_mode(void) * the aggregation translate cpumap. */ nr = perf_cpu_map__max(evsel_list->core.cpus); - stat_config.cpus_aggr_map = perf_cpu_map__empty_new(nr + 1); + stat_config.cpus_aggr_map = cpu_aggr_map__empty_new(nr + 1); return stat_config.cpus_aggr_map ? 0 : -ENOMEM; } +static void cpu_aggr_map__delete(struct cpu_aggr_map *map) +{ + if (map) { + WARN_ONCE(refcount_read(&map->refcnt) != 0, + "cpu_aggr_map refcnt unbalanced\n"); + free(map); + } +} + +static void cpu_aggr_map__put(struct cpu_aggr_map *map) +{ + if (map && refcount_dec_and_test(&map->refcnt)) + cpu_aggr_map__delete(map); +} + static void perf_stat__exit_aggr_mode(void) { - perf_cpu_map__put(stat_config.aggr_map); - perf_cpu_map__put(stat_config.cpus_aggr_map); + cpu_aggr_map__put(stat_config.aggr_map); + cpu_aggr_map__put(stat_config.cpus_aggr_map); stat_config.aggr_map = NULL; stat_config.cpus_aggr_map = NULL; } @@ -1425,25 +1440,25 @@ static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, } static int perf_env__build_socket_map(struct perf_env *env, struct perf_cpu_map *cpus, - struct perf_cpu_map **sockp) + struct cpu_aggr_map **sockp) { return cpu_map__build_map(cpus, sockp, perf_env__get_socket, env); } static int perf_env__build_die_map(struct perf_env *env, struct perf_cpu_map *cpus, - struct perf_cpu_map **diep) + struct cpu_aggr_map **diep) { return cpu_map__build_map(cpus, diep, perf_env__get_die, env); } static int perf_env__build_core_map(struct perf_env *env, struct perf_cpu_map *cpus, - struct perf_cpu_map **corep) + struct cpu_aggr_map **corep) { return cpu_map__build_map(cpus, corep, perf_env__get_core, env); } static int perf_env__build_node_map(struct perf_env *env, struct perf_cpu_map *cpus, - struct perf_cpu_map **nodep) + struct cpu_aggr_map **nodep) { return cpu_map__build_map(cpus, nodep, perf_env__get_node, env); } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index b18e53506656..ea81586305f4 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -151,12 +151,12 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) return a->id - b->id; } -int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, +int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data), void *data) { int nr = cpus->nr; - struct perf_cpu_map *c = perf_cpu_map__empty_new(nr); + struct cpu_aggr_map *c = cpu_aggr_map__empty_new(nr); int cpu, s2; struct aggr_cpu_id s1; @@ -279,22 +279,22 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *da return id; } -int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp) +int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp) { return cpu_map__build_map(cpus, sockp, cpu_map__get_socket, NULL); } -int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map **diep) +int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep) { return cpu_map__build_map(cpus, diep, cpu_map__get_die, NULL); } -int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map **corep) +int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep) { return cpu_map__build_map(cpus, corep, cpu_map__get_core, NULL); } -int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct perf_cpu_map **numap) +int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **numap) { return cpu_map__build_map(cpus, numap, cpu_map__get_node, NULL); } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index ebd65c4f431b..b112069038be 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -34,10 +34,10 @@ int cpu_map__get_core_id(int cpu); struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *data); int cpu_map__get_node_id(int cpu); struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *data); -int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct perf_cpu_map **sockp); -int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct perf_cpu_map **diep); -int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct perf_cpu_map **corep); -int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct perf_cpu_map **nodep); +int cpu_map__build_socket_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **sockp); +int cpu_map__build_die_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **diep); +int cpu_map__build_core_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **corep); +int cpu_map__build_node_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **nodep); const struct perf_cpu_map *cpu_map__online(void); /* thread unsafe */ static inline int cpu_map__socket(struct perf_cpu_map *sock, int s) @@ -69,7 +69,7 @@ int cpu__max_cpu(void); int cpu__max_present_cpu(void); int cpu__get_node(int cpu); -int cpu_map__build_map(struct perf_cpu_map *cpus, struct perf_cpu_map **res, +int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, struct aggr_cpu_id (*f)(struct perf_cpu_map *map, int cpu, void *data), void *data); diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 87522ffb2db2..b5369730b4a2 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -139,9 +139,9 @@ struct perf_stat_config { const char *csv_sep; struct stats *walltime_nsecs_stats; struct rusage ru_data; - struct perf_cpu_map *aggr_map; + struct cpu_aggr_map *aggr_map; aggr_get_id_t aggr_get_id; - struct perf_cpu_map *cpus_aggr_map; + struct cpu_aggr_map *cpus_aggr_map; u64 *walltime_run; struct rblist metric_events; int ctl_fd; From ff5232956e074994a66656f709c3ad1ee3d8a550 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:23 +0200 Subject: [PATCH 312/326] perf stat aggregation: Start using cpu_aggr_id in map Use the new cpu_aggr_id struct in the cpu map instead of int so that it can store more data. No functional changes. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-8-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 6 +++--- tools/perf/util/cpumap.c | 8 ++++---- tools/perf/util/cpumap.h | 2 +- tools/perf/util/stat-display.c | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index cc017c7dcee9..3fec1b7e93b5 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1221,10 +1221,10 @@ static struct aggr_cpu_id perf_stat__get_aggr(struct perf_stat_config *config, cpu = map->map[idx]; - if (config->cpus_aggr_map->map[cpu] == -1) - config->cpus_aggr_map->map[cpu] = get_id(config, map, idx).id; + if (cpu_map__aggr_cpu_id_is_empty(config->cpus_aggr_map->map[cpu])) + config->cpus_aggr_map->map[cpu] = get_id(config, map, idx); - id.id = config->cpus_aggr_map->map[cpu]; + id = config->cpus_aggr_map->map[cpu]; return id; } diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index ea81586305f4..b50609b9a585 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -97,14 +97,14 @@ struct perf_cpu_map *perf_cpu_map__empty_new(int nr) struct cpu_aggr_map *cpu_aggr_map__empty_new(int nr) { - struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(int) * nr); + struct cpu_aggr_map *cpus = malloc(sizeof(*cpus) + sizeof(struct aggr_cpu_id) * nr); if (cpus != NULL) { int i; cpus->nr = nr; for (i = 0; i < nr; i++) - cpus->map[i] = -1; + cpus->map[i] = cpu_map__empty_aggr_cpu_id(); refcount_set(&cpus->refcnt, 1); } @@ -169,11 +169,11 @@ int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, for (cpu = 0; cpu < nr; cpu++) { s1 = f(cpus, cpu, data); for (s2 = 0; s2 < c->nr; s2++) { - if (s1.id == c->map[s2]) + if (cpu_map__compare_aggr_cpu_id(s1, c->map[s2])) break; } if (s2 == c->nr) { - c->map[c->nr] = s1.id; + c->map[c->nr] = s1; c->nr++; } } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index b112069038be..d8fc265bc762 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -14,7 +14,7 @@ struct aggr_cpu_id { struct cpu_aggr_map { refcount_t refcnt; int nr; - int map[]; + struct aggr_cpu_id map[]; }; struct perf_record_cpu_map_data; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 8f4ee3621863..a6309cedb37b 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -506,7 +506,7 @@ static void aggr_update_shadow(struct perf_stat_config *config, struct evsel *counter; for (s = 0; s < config->aggr_map->nr; s++) { - id.id = config->aggr_map->map[s]; + id = config->aggr_map->map[s]; evlist__for_each_entry(evlist, counter) { val = 0; for (cpu = 0; cpu < evsel__nr_cpus(counter); cpu++) { @@ -638,7 +638,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config, struct aggr_cpu_id id; double uval; - ad.id.id = id.id = config->aggr_map->map[s]; + ad.id = id = config->aggr_map->map[s]; ad.val = ad.ena = ad.run = 0; ad.nr = 0; if (!collect_data(config, counter, aggr_cb, &ad)) @@ -1166,7 +1166,7 @@ static void print_percore_thread(struct perf_stat_config *config, for (int i = 0; i < evsel__nr_cpus(counter); i++) { s2 = config->aggr_get_id(config, evsel__cpus(counter), i); for (s = 0; s < config->aggr_map->nr; s++) { - id.id = config->aggr_map->map[s]; + id = config->aggr_map->map[s]; if (cpu_map__compare_aggr_cpu_id(s2, id)) break; } From fcd83a35dd93b89d3f48cfcd33c31b112cc96180 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:24 +0200 Subject: [PATCH 313/326] perf stat aggregation: Add separate node member Add node as a separate member so that it doesn't have to be packed into the int value. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-9-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 8 +++++++- tools/perf/util/cpumap.c | 16 +++++++++++----- tools/perf/util/cpumap.h | 1 + tools/perf/util/stat-display.c | 2 +- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3fec1b7e93b5..e51c5469d712 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1435,7 +1435,7 @@ static struct aggr_cpu_id perf_env__get_node(struct perf_cpu_map *map, int idx, int cpu = perf_env__get_cpu(data, map, idx); struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); - id.id = perf_env__numa_node(data, cpu); + id.node = perf_env__numa_node(data, cpu); return id; } diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 7a7d16b3950a..ee272d45a445 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Core map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id.id)); + TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); } // Test that die ID contains socket and die @@ -131,6 +132,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Die map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id.id << 16)); + + TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); } // Test that socket ID contains only socket @@ -138,13 +141,16 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) id = cpu_map__get_socket(map, i, NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", session->header.env.cpu[map->map[i]].socket_id == id.id); + + TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); } // Test that node ID contains only node for (i = 0; i < map->nr; i++) { id = cpu_map__get_node(map, i, NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(map->map[i]) == id.id); + cpu__get_node(map->map[i]) == id.node); + TEST_ASSERT_VAL("Node map - ID is set", id.id == -1); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index b50609b9a585..5f9e98ddbe34 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -148,7 +148,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer; struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer; - return a->id - b->id; + if (a->id != b->id) + return a->id - b->id; + else + return a->node - b->node; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, @@ -275,7 +278,7 @@ struct aggr_cpu_id cpu_map__get_node(struct perf_cpu_map *map, int idx, void *da if (idx < 0 || idx >= map->nr) return id; - id.id = cpu_map__get_node_id(map->map[idx]); + id.node = cpu_map__get_node_id(map->map[idx]); return id; } @@ -620,18 +623,21 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) { - return a.id == b.id; + return a.id == b.id && + a.node == b.node; } bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) { - return a.id == -1; + return a.id == -1 && + a.node == -1; } struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) { struct aggr_cpu_id ret = { - .id = -1 + .id = -1, + .node = -1 }; return ret; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index d8fc265bc762..f79e92603024 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -9,6 +9,7 @@ struct aggr_cpu_id { int id; + int node; }; struct cpu_aggr_map { diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index a6309cedb37b..790392247026 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -104,7 +104,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_NODE: fprintf(config->output, "N%*d%s%*d%s", config->csv_output ? 0 : -5, - id.id, + id.node, config->csv_sep, config->csv_output ? 0 : 4, nr, From 1a270cb6b3cc18663f7fd165aa691c48d68739f2 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:25 +0200 Subject: [PATCH 314/326] perf stat aggregation: Add separate socket member Add socket as a separate member so that it doesn't have to be packed into the int value. When the socket ID was larger than 8 bits the output appeared corrupted or incomplete. For example, here on ThunderX2 'perf stat' reports a socket of -1 and an invalid die number: ./perf stat -a --per-die The socket id number is too big. Performance counter stats for 'system wide': S-1-D255 128 687.99 msec cpu-clock # 57.240 CPUs utilized ... S36-D0 128 842.34 msec cpu-clock # 70.081 CPUs utilized ... And with --per-core there is an entry with an invalid core ID: ./perf stat record -a --per-core The socket id number is too big. Performance counter stats for 'system wide': S-1-D255-C65535 128 671.04 msec cpu-clock # 54.112 CPUs utilized ... S36-D0-C0 4 28.27 msec cpu-clock # 2.279 CPUs utilized ... This fixes the "Session topology" self test on ThunderX2. After this fix the output contains the correct socket and die IDs and no longer prints a warning about the size of the socket ID: ./perf stat --per-die -a Performance counter stats for 'system wide': S36-D0 128 169,869.39 msec cpu-clock # 127.501 CPUs utilized ... S3612-D0 128 169,733.05 msec cpu-clock # 127.398 CPUs utilized Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-10-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 22 ++++++---------- tools/perf/tests/topology.c | 10 ++++---- tools/perf/util/cpumap.c | 46 +++++++++++++++++----------------- tools/perf/util/cpumap.h | 6 +---- tools/perf/util/stat-display.c | 8 +++--- tools/perf/util/stat.c | 2 +- 6 files changed, 42 insertions(+), 52 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e51c5469d712..6248baa0f612 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1369,7 +1369,7 @@ static struct aggr_cpu_id perf_env__get_socket(struct perf_cpu_map *map, int idx struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (cpu != -1) - id.id = env->cpu[cpu].socket_id; + id.socket = env->cpu[cpu].socket_id; return id; } @@ -1382,18 +1382,16 @@ static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, v if (cpu != -1) { /* - * Encode socket in bit range 15:8 - * die_id is relative to socket, - * we need a global id. So we combine - * socket + die id + * die_id is relative to socket, so start + * with the socket ID and then add die to + * make a unique ID. */ - if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); + id.socket = env->cpu[cpu].socket_id; if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); - id.id = (env->cpu[cpu].socket_id << 8) | (env->cpu[cpu].die_id & 0xff); + id.id = env->cpu[cpu].die_id & 0xff; } return id; @@ -1407,23 +1405,19 @@ static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, if (cpu != -1) { /* - * Encode socket in bit range 31:24 * encode die id in bit range 23:16 * core_id is relative to socket and die, * we need a global id. So we combine * socket + die id + core id */ - if (WARN_ONCE(env->cpu[cpu].socket_id >> 8, "The socket id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); - id.id = (env->cpu[cpu].socket_id << 24) | - (env->cpu[cpu].die_id << 16) | + id.socket = env->cpu[cpu].socket_id; + id.id = (env->cpu[cpu].die_id << 16) | (env->cpu[cpu].core_id & 0xffff); } diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index ee272d45a445..777dd8291bcc 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -114,8 +114,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[map->map[i]].core_id == cpu_map__id_to_cpu(id.id)); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", - session->header.env.cpu[map->map[i]].socket_id == - cpu_map__id_to_socket(id.id)); + session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id.id)); @@ -126,8 +125,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) for (i = 0; i < map->nr; i++) { id = cpu_map__get_die(map, i, NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", - session->header.env.cpu[map->map[i]].socket_id == - cpu_map__id_to_socket(id.id << 16)); + session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == @@ -140,9 +138,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) for (i = 0; i < map->nr; i++) { id = cpu_map__get_socket(map, i, NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[map->map[i]].socket_id == id.id); + session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); + TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1); } // Test that node ID contains only node @@ -151,6 +150,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Node map - Node ID doesn't match", cpu__get_node(map->map[i]) == id.node); TEST_ASSERT_VAL("Node map - ID is set", id.id == -1); + TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 5f9e98ddbe34..d2630f03f682 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -139,7 +139,7 @@ struct aggr_cpu_id cpu_map__get_socket(struct perf_cpu_map *map, int idx, cpu = map->map[idx]; - id.id = cpu_map__get_socket_id(cpu); + id.socket = cpu_map__get_socket_id(cpu); return id; } @@ -150,8 +150,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) if (a->id != b->id) return a->id - b->id; - else + else if (a->node != b->node) return a->node - b->node; + else + return a->socket - b->socket; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, @@ -196,7 +198,7 @@ int cpu_map__get_die_id(int cpu) struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *data) { - int cpu, s; + int cpu, die; struct aggr_cpu_id id = cpu_map__empty_aggr_cpu_id(); if (idx > map->nr) @@ -204,28 +206,24 @@ struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *dat cpu = map->map[idx]; - id.id = cpu_map__get_die_id(cpu); + die = cpu_map__get_die_id(cpu); /* There is no die_id on legacy system. */ - if (id.id == -1) - id.id = 0; - - s = cpu_map__get_socket(map, idx, data).id; - if (s == -1) - return cpu_map__empty_aggr_cpu_id(); + if (die == -1) + die = 0; /* - * Encode socket in bit range 15:8 - * die_id is relative to socket, and - * we need a global id. So we combine - * socket + die id + * die_id is relative to socket, so start + * with the socket ID and then add die to + * make a unique ID. */ - if (WARN_ONCE(id.id >> 8, "The die id number is too big.\n")) + id = cpu_map__get_socket(map, idx, data); + if (cpu_map__aggr_cpu_id_is_empty(id)) + return id; + + if (WARN_ONCE(die >> 8, "The die id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); - if (WARN_ONCE(s >> 8, "The socket id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - - id.id = (s << 8) | (id.id & 0xff); + id.id = (die & 0xff); return id; } @@ -258,7 +256,6 @@ struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *da return id; /* - * encode socket in bit range 31:24 * encode die id in bit range 23:16 * core_id is relative to socket and die, * we need a global id. So we combine @@ -624,20 +621,23 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) { return a.id == b.id && - a.node == b.node; + a.node == b.node && + a.socket == b.socket; } bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) { return a.id == -1 && - a.node == -1; + a.node == -1 && + a.socket == -1; } struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) { struct aggr_cpu_id ret = { .id = -1, - .node = -1 + .node = -1, + .socket = -1 }; return ret; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index f79e92603024..0123ecc90694 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -10,6 +10,7 @@ struct aggr_cpu_id { int id; int node; + int socket; }; struct cpu_aggr_map { @@ -48,11 +49,6 @@ static inline int cpu_map__socket(struct perf_cpu_map *sock, int s) return sock->map[s]; } -static inline int cpu_map__id_to_socket(int id) -{ - return id >> 24; -} - static inline int cpu_map__id_to_die(int id) { return (id >> 16) & 0xff; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 790392247026..5a756c88c124 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -73,7 +73,7 @@ static void aggr_printout(struct perf_stat_config *config, switch (config->aggr_mode) { case AGGR_CORE: fprintf(config->output, "S%d-D%d-C%*d%s%*d%s", - cpu_map__id_to_socket(id.id), + id.socket, cpu_map__id_to_die(id.id), config->csv_output ? 0 : -8, cpu_map__id_to_cpu(id.id), @@ -84,7 +84,7 @@ static void aggr_printout(struct perf_stat_config *config, break; case AGGR_DIE: fprintf(config->output, "S%d-D%*d%s%*d%s", - cpu_map__id_to_socket(id.id << 16), + id.socket, config->csv_output ? 0 : -8, cpu_map__id_to_die(id.id << 16), config->csv_sep, @@ -95,7 +95,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_SOCKET: fprintf(config->output, "S%*d%s%*d%s", config->csv_output ? 0 : -5, - id.id, + id.socket, config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -113,7 +113,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_NONE: if (evsel->percore && !config->percore_show_thread) { fprintf(config->output, "S%d-D%d-C%*d%s", - cpu_map__id_to_socket(id.id), + id.socket, cpu_map__id_to_die(id.id), config->csv_output ? 0 : -3, cpu_map__id_to_cpu(id.id), config->csv_sep); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 1d5c9f98396b..8ce1479c98f0 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -313,7 +313,7 @@ static int check_per_pkg(struct evsel *counter, if (!(vals->run && vals->ena)) return 0; - s = cpu_map__get_socket(cpus, cpu, NULL).id; + s = cpu_map__get_socket(cpus, cpu, NULL).socket; if (s < 0) return -1; From ba2ee166d92b201078cb941956547ab9828989d3 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:26 +0200 Subject: [PATCH 315/326] perf stat aggregation: Add separate die member Add die as a separate member so that it doesn't have to be packed into the int value. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-11-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 14 +++----------- tools/perf/tests/topology.c | 8 +++++--- tools/perf/util/cpumap.c | 28 ++++++++++++++-------------- tools/perf/util/cpumap.h | 6 +----- tools/perf/util/stat-display.c | 6 +++--- 5 files changed, 26 insertions(+), 36 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6248baa0f612..bac37fe9373c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1387,11 +1387,7 @@ static struct aggr_cpu_id perf_env__get_die(struct perf_cpu_map *map, int idx, v * make a unique ID. */ id.socket = env->cpu[cpu].socket_id; - - if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - - id.id = env->cpu[cpu].die_id & 0xff; + id.die = env->cpu[cpu].die_id; } return id; @@ -1405,20 +1401,16 @@ static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, if (cpu != -1) { /* - * encode die id in bit range 23:16 * core_id is relative to socket and die, * we need a global id. So we combine * socket + die id + core id */ - if (WARN_ONCE(env->cpu[cpu].die_id >> 8, "The die id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); id.socket = env->cpu[cpu].socket_id; - id.id = (env->cpu[cpu].die_id << 16) | - (env->cpu[cpu].core_id & 0xffff); + id.die = env->cpu[cpu].die_id; + id.id = env->cpu[cpu].core_id & 0xffff; } return id; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 777dd8291bcc..e3f822890a84 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -117,7 +117,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[map->map[i]].die_id == cpu_map__id_to_die(id.id)); + session->header.env.cpu[map->map[i]].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); } @@ -128,10 +128,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", - session->header.env.cpu[map->map[i]].die_id == - cpu_map__id_to_die(id.id << 16)); + session->header.env.cpu[map->map[i]].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); + TEST_ASSERT_VAL("Die map - ID is set", id.id == -1); } // Test that socket ID contains only socket @@ -141,6 +141,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[map->map[i]].socket_id == id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); + TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1); } @@ -151,6 +152,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) cpu__get_node(map->map[i]) == id.node); TEST_ASSERT_VAL("Node map - ID is set", id.id == -1); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); + TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index d2630f03f682..10a52058d838 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -152,8 +152,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) return a->id - b->id; else if (a->node != b->node) return a->node - b->node; - else + else if (a->socket != b->socket) return a->socket - b->socket; + else + return a->die - b->die; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, @@ -220,10 +222,7 @@ struct aggr_cpu_id cpu_map__get_die(struct perf_cpu_map *map, int idx, void *dat if (cpu_map__aggr_cpu_id_is_empty(id)) return id; - if (WARN_ONCE(die >> 8, "The die id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - - id.id = (die & 0xff); + id.die = die; return id; } @@ -250,21 +249,19 @@ struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *da cpu = cpu_map__get_core_id(cpu); - /* cpu_map__get_die returns the combination of socket + die id */ + /* cpu_map__get_die returns a struct with socket and die set*/ id = cpu_map__get_die(map, idx, data); if (cpu_map__aggr_cpu_id_is_empty(id)) return id; /* - * encode die id in bit range 23:16 - * core_id is relative to socket and die, - * we need a global id. So we combine - * socket + die id + core id + * core_id is relative to socket and die, we need a global id. + * So we combine the result from cpu_map__get_die with the core id */ if (WARN_ONCE(cpu >> 16, "The core id number is too big.\n")) return cpu_map__empty_aggr_cpu_id(); - id.id = (id.id << 16) | (cpu & 0xffff); + id.id = (cpu & 0xffff); return id; } @@ -622,14 +619,16 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) { return a.id == b.id && a.node == b.node && - a.socket == b.socket; + a.socket == b.socket && + a.die == b.die; } bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) { return a.id == -1 && a.node == -1 && - a.socket == -1; + a.socket == -1 && + a.die == -1; } struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) @@ -637,7 +636,8 @@ struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) struct aggr_cpu_id ret = { .id = -1, .node = -1, - .socket = -1 + .socket = -1, + .die = -1 }; return ret; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 0123ecc90694..51bbe1eca3f4 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -11,6 +11,7 @@ struct aggr_cpu_id { int id; int node; int socket; + int die; }; struct cpu_aggr_map { @@ -49,11 +50,6 @@ static inline int cpu_map__socket(struct perf_cpu_map *sock, int s) return sock->map[s]; } -static inline int cpu_map__id_to_die(int id) -{ - return (id >> 16) & 0xff; -} - static inline int cpu_map__id_to_cpu(int id) { return id & 0xffff; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 5a756c88c124..dcce753f351d 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -74,7 +74,7 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_CORE: fprintf(config->output, "S%d-D%d-C%*d%s%*d%s", id.socket, - cpu_map__id_to_die(id.id), + id.die, config->csv_output ? 0 : -8, cpu_map__id_to_cpu(id.id), config->csv_sep, @@ -86,7 +86,7 @@ static void aggr_printout(struct perf_stat_config *config, fprintf(config->output, "S%d-D%*d%s%*d%s", id.socket, config->csv_output ? 0 : -8, - cpu_map__id_to_die(id.id << 16), + id.die, config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -114,7 +114,7 @@ static void aggr_printout(struct perf_stat_config *config, if (evsel->percore && !config->percore_show_thread) { fprintf(config->output, "S%d-D%d-C%*d%s", id.socket, - cpu_map__id_to_die(id.id), + id.die, config->csv_output ? 0 : -3, cpu_map__id_to_cpu(id.id), config->csv_sep); } else if (id.id > -1) { From b993381779da406ca9ca0ae1e1b3968e9075ce77 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:27 +0200 Subject: [PATCH 316/326] perf stat aggregation: Add separate core member Add core as a separate member so that it doesn't have to be packed into the int value. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-12-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 9 +++------ tools/perf/tests/topology.c | 6 +++++- tools/perf/util/cpumap.c | 18 ++++++++++-------- tools/perf/util/cpumap.h | 6 +----- tools/perf/util/stat-display.c | 16 ++++++++-------- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bac37fe9373c..8cc24967bc27 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1402,15 +1402,12 @@ static struct aggr_cpu_id perf_env__get_core(struct perf_cpu_map *map, int idx, if (cpu != -1) { /* * core_id is relative to socket and die, - * we need a global id. So we combine - * socket + die id + core id + * we need a global id. So we set + * socket, die id and core id */ - if (WARN_ONCE(env->cpu[cpu].core_id >> 16, "The core id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - id.socket = env->cpu[cpu].socket_id; id.die = env->cpu[cpu].die_id; - id.id = env->cpu[cpu].core_id & 0xffff; + id.core = env->cpu[cpu].core_id; } return id; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index e3f822890a84..a6e289f2c6db 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -111,7 +111,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) for (i = 0; i < map->nr; i++) { id = cpu_map__get_core(map, i, NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[map->map[i]].core_id == cpu_map__id_to_cpu(id.id)); + session->header.env.cpu[map->map[i]].core_id == id.core); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", session->header.env.cpu[map->map[i]].socket_id == id.socket); @@ -119,6 +119,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Core map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); + TEST_ASSERT_VAL("Core map - ID is set", id.id == -1); } // Test that die ID contains socket and die @@ -132,6 +133,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - ID is set", id.id == -1); + TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); } // Test that socket ID contains only socket @@ -143,6 +145,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1); + TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1); } // Test that node ID contains only node @@ -153,6 +156,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Node map - ID is set", id.id == -1); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); + TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 10a52058d838..d164f7bd1ac7 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -154,8 +154,10 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) return a->node - b->node; else if (a->socket != b->socket) return a->socket - b->socket; - else + else if (a->die != b->die) return a->die - b->die; + else + return a->core - b->core; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, @@ -258,10 +260,7 @@ struct aggr_cpu_id cpu_map__get_core(struct perf_cpu_map *map, int idx, void *da * core_id is relative to socket and die, we need a global id. * So we combine the result from cpu_map__get_die with the core id */ - if (WARN_ONCE(cpu >> 16, "The core id number is too big.\n")) - return cpu_map__empty_aggr_cpu_id(); - - id.id = (cpu & 0xffff); + id.core = cpu; return id; } @@ -620,7 +619,8 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) return a.id == b.id && a.node == b.node && a.socket == b.socket && - a.die == b.die; + a.die == b.die && + a.core == b.core; } bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) @@ -628,7 +628,8 @@ bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) return a.id == -1 && a.node == -1 && a.socket == -1 && - a.die == -1; + a.die == -1 && + a.core == -1; } struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) @@ -637,7 +638,8 @@ struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) .id = -1, .node = -1, .socket = -1, - .die = -1 + .die = -1, + .core = -1 }; return ret; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 51bbe1eca3f4..1bb8f7d47206 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -12,6 +12,7 @@ struct aggr_cpu_id { int node; int socket; int die; + int core; }; struct cpu_aggr_map { @@ -50,11 +51,6 @@ static inline int cpu_map__socket(struct perf_cpu_map *sock, int s) return sock->map[s]; } -static inline int cpu_map__id_to_cpu(int id) -{ - return id & 0xffff; -} - int cpu__setup_cpunode_map(void); int cpu__max_node(void); diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index dcce753f351d..2b3842f6080d 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -76,7 +76,7 @@ static void aggr_printout(struct perf_stat_config *config, id.socket, id.die, config->csv_output ? 0 : -8, - cpu_map__id_to_cpu(id.id), + id.core, config->csv_sep, config->csv_output ? 0 : 4, nr, @@ -116,11 +116,11 @@ static void aggr_printout(struct perf_stat_config *config, id.socket, id.die, config->csv_output ? 0 : -3, - cpu_map__id_to_cpu(id.id), config->csv_sep); - } else if (id.id > -1) { + id.core, config->csv_sep); + } else if (id.core > -1) { fprintf(config->output, "CPU%*d%s", config->csv_output ? 0 : -7, - evsel__cpus(evsel)->map[id.id], + evsel__cpus(evsel)->map[id.core], config->csv_sep); } break; @@ -326,7 +326,7 @@ static int first_shadow_cpu(struct perf_stat_config *config, int i; if (config->aggr_mode == AGGR_NONE) - return id.id; + return id.core; if (!config->aggr_get_id) return 0; @@ -658,7 +658,7 @@ static void print_counter_aggrdata(struct perf_stat_config *config, uval = val * counter->scale; if (cpu != -1) { id = cpu_map__empty_aggr_cpu_id(); - id.id = cpu; + id.core = cpu; } printout(config, id, nr, counter, uval, prefix, run, ena, 1.0, &rt_stat); @@ -871,7 +871,7 @@ static void print_counter(struct perf_stat_config *config, uval = val * counter->scale; id = cpu_map__empty_aggr_cpu_id(); - id.id = cpu; + id.core = cpu; printout(config, id, 0, counter, uval, prefix, run, ena, 1.0, &rt_stat); @@ -898,7 +898,7 @@ static void print_no_aggr_metric(struct perf_stat_config *config, fputs(prefix, config->output); evlist__for_each_entry(evlist, counter) { id = cpu_map__empty_aggr_cpu_id(); - id.id = cpu; + id.core = cpu; if (first) { aggr_printout(config, counter, id, 0); first = false; From 8d4852b468c38168c4e1e1652602b4a6c6c080b3 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 26 Nov 2020 16:13:28 +0200 Subject: [PATCH 317/326] perf stat aggregation: Add separate thread member A separate field isn't strictly required. The core field could be re-used for thread IDs as a single field was used previously. But separating them will avoid confusion and catch potential errors where core IDs are read as thread IDs and vice versa. Also remove the placeholder id field which is now no longer used. Signed-off-by: James Clark Acked-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Tested-by: John Garry Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Richter Link: https://lore.kernel.org/r/20201126141328.6509-13-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/topology.c | 8 ++++---- tools/perf/util/cpumap.c | 14 +++++++------- tools/perf/util/cpumap.h | 2 +- tools/perf/util/stat-display.c | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index a6e289f2c6db..74748ed75b2c 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -119,7 +119,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Core map - Die ID doesn't match", session->header.env.cpu[map->map[i]].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); - TEST_ASSERT_VAL("Core map - ID is set", id.id == -1); + TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1); } // Test that die ID contains socket and die @@ -132,8 +132,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) session->header.env.cpu[map->map[i]].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); - TEST_ASSERT_VAL("Die map - ID is set", id.id == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); + TEST_ASSERT_VAL("Die map - Thread is set", id.thread == -1); } // Test that socket ID contains only socket @@ -144,8 +144,8 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); - TEST_ASSERT_VAL("Socket map - ID is set", id.id == -1); TEST_ASSERT_VAL("Socket map - Core is set", id.core == -1); + TEST_ASSERT_VAL("Socket map - Thread is set", id.thread == -1); } // Test that node ID contains only node @@ -153,10 +153,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) id = cpu_map__get_node(map, i, NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", cpu__get_node(map->map[i]) == id.node); - TEST_ASSERT_VAL("Node map - ID is set", id.id == -1); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); + TEST_ASSERT_VAL("Node map - Thread is set", id.thread == -1); } perf_session__delete(session); diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index d164f7bd1ac7..87d3eca9b872 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -148,16 +148,16 @@ static int cmp_aggr_cpu_id(const void *a_pointer, const void *b_pointer) struct aggr_cpu_id *a = (struct aggr_cpu_id *)a_pointer; struct aggr_cpu_id *b = (struct aggr_cpu_id *)b_pointer; - if (a->id != b->id) - return a->id - b->id; - else if (a->node != b->node) + if (a->node != b->node) return a->node - b->node; else if (a->socket != b->socket) return a->socket - b->socket; else if (a->die != b->die) return a->die - b->die; - else + else if (a->core != b->core) return a->core - b->core; + else + return a->thread - b->thread; } int cpu_map__build_map(struct perf_cpu_map *cpus, struct cpu_aggr_map **res, @@ -616,7 +616,7 @@ const struct perf_cpu_map *cpu_map__online(void) /* thread unsafe */ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) { - return a.id == b.id && + return a.thread == b.thread && a.node == b.node && a.socket == b.socket && a.die == b.die && @@ -625,7 +625,7 @@ bool cpu_map__compare_aggr_cpu_id(struct aggr_cpu_id a, struct aggr_cpu_id b) bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) { - return a.id == -1 && + return a.thread == -1 && a.node == -1 && a.socket == -1 && a.die == -1 && @@ -635,7 +635,7 @@ bool cpu_map__aggr_cpu_id_is_empty(struct aggr_cpu_id a) struct aggr_cpu_id cpu_map__empty_aggr_cpu_id(void) { struct aggr_cpu_id ret = { - .id = -1, + .thread = -1, .node = -1, .socket = -1, .die = -1, diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 1bb8f7d47206..a27eeaf086e8 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -8,7 +8,7 @@ #include struct aggr_cpu_id { - int id; + int thread; int node; int socket; int die; diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c index 2b3842f6080d..583ae4f09c5d 100644 --- a/tools/perf/util/stat-display.c +++ b/tools/perf/util/stat-display.c @@ -127,9 +127,9 @@ static void aggr_printout(struct perf_stat_config *config, case AGGR_THREAD: fprintf(config->output, "%*s-%*d%s", config->csv_output ? 0 : 16, - perf_thread_map__comm(evsel->core.threads, id.id), + perf_thread_map__comm(evsel->core.threads, id.thread), config->csv_output ? 0 : -8, - perf_thread_map__pid(evsel->core.threads, id.id), + perf_thread_map__pid(evsel->core.threads, id.thread), config->csv_sep); break; case AGGR_GLOBAL: @@ -740,7 +740,7 @@ static struct perf_aggr_thread_value *sort_aggr_thread( buf[i].counter = counter; buf[i].id = cpu_map__empty_aggr_cpu_id(); - buf[i].id.id = thread; + buf[i].id.thread = thread; buf[i].uval = uval; buf[i].val = val; buf[i].run = run; @@ -781,7 +781,7 @@ static void print_aggr_thread(struct perf_stat_config *config, if (config->stats) printout(config, id, 0, buf[thread].counter, buf[thread].uval, prefix, buf[thread].run, buf[thread].ena, 1.0, - &config->stats[id.id]); + &config->stats[id.thread]); else printout(config, id, 0, buf[thread].counter, buf[thread].uval, prefix, buf[thread].run, buf[thread].ena, 1.0, From 5149303fdfe5c67ddb51c911e23262f781cd75eb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 24 Dec 2020 10:52:10 -0300 Subject: [PATCH 318/326] perf probe: Fix memory leak when synthesizing SDT probes The argv_split() function must be paired with argv_free(), else we must keep a reference to the argv array received or do the freeing ourselves, in synthesize_sdt_probe_command() we were simply leaking that argv[] array. Fixes: 3b1f8311f6963cd1 ("perf probe: Add sdt probes arguments into the uprobe cmd string") Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexandre Truong Cc: Alexis Berlemont Cc: He Zhe Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sumanth Korikkar Cc: Thomas Richter Cc: Will Deacon Link: https://lore.kernel.org/r/20201224135139.GF477817@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-file.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c index 064b63a6a3f3..bbecb449ea94 100644 --- a/tools/perf/util/probe-file.c +++ b/tools/perf/util/probe-file.c @@ -791,7 +791,7 @@ static char *synthesize_sdt_probe_command(struct sdt_note *note, const char *sdtgrp) { struct strbuf buf; - char *ret = NULL, **args; + char *ret = NULL; int i, args_count, err; unsigned long long ref_ctr_offset; @@ -813,12 +813,19 @@ static char *synthesize_sdt_probe_command(struct sdt_note *note, goto out; if (note->args) { - args = argv_split(note->args, &args_count); + char **args = argv_split(note->args, &args_count); + + if (args == NULL) + goto error; for (i = 0; i < args_count; ++i) { - if (synthesize_sdt_probe_arg(&buf, i, args[i]) < 0) + if (synthesize_sdt_probe_arg(&buf, i, args[i]) < 0) { + argv_free(args); goto error; + } } + + argv_free(args); } out: From 6268d7da4d192af339f4d688942b9ccb45a65e04 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 18 Dec 2020 18:41:41 -0800 Subject: [PATCH 319/326] device-dax: Fix range release There are multiple locations that open-code the release of the last range in a device-dax instance. Consolidate this into a new dev_dax_trim_range() helper. This also addresses a kmemleak report: # cat /sys/kernel/debug/kmemleak [..] unreferenced object 0xffff976bd46f6240 (size 64): comm "ndctl", pid 23556, jiffies 4299514316 (age 5406.733s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 20 c3 37 00 00 00 .......... .7... ff ff ff 7f 38 00 00 00 00 00 00 00 00 00 00 00 ....8........... backtrace: [<00000000064003cf>] __kmalloc_track_caller+0x136/0x379 [<00000000d85e3c52>] krealloc+0x67/0x92 [<00000000d7d3ba8a>] __alloc_dev_dax_range+0x73/0x25c [<0000000027d58626>] devm_create_dev_dax+0x27d/0x416 [<00000000434abd43>] __dax_pmem_probe+0x1c9/0x1000 [dax_pmem_core] [<0000000083726c1c>] dax_pmem_probe+0x10/0x1f [dax_pmem] [<00000000b5f2319c>] nvdimm_bus_probe+0x9d/0x340 [libnvdimm] [<00000000c055e544>] really_probe+0x230/0x48d [<000000006cabd38e>] driver_probe_device+0x122/0x13b [<0000000029c7b95a>] device_driver_attach+0x5b/0x60 [<0000000053e5659b>] bind_store+0xb7/0xc3 [<00000000d3bdaadc>] drv_attr_store+0x27/0x31 [<00000000949069c5>] sysfs_kf_write+0x4a/0x57 [<000000004a8b5adf>] kernfs_fop_write+0x150/0x1e5 [<00000000bded60f0>] __vfs_write+0x1b/0x34 [<00000000b92900f0>] vfs_write+0xd8/0x1d1 Reported-by: Jane Chu Cc: Zhen Lei Link: https://lore.kernel.org/r/160834570161.1791850.14911670304441510419.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/dax/bus.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 9761cb40d4bb..720cd140209f 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -367,19 +367,28 @@ void kill_dev_dax(struct dev_dax *dev_dax) } EXPORT_SYMBOL_GPL(kill_dev_dax); -static void free_dev_dax_ranges(struct dev_dax *dev_dax) +static void trim_dev_dax_range(struct dev_dax *dev_dax) { + int i = dev_dax->nr_range - 1; + struct range *range = &dev_dax->ranges[i].range; struct dax_region *dax_region = dev_dax->region; - int i; device_lock_assert(dax_region->dev); - for (i = 0; i < dev_dax->nr_range; i++) { - struct range *range = &dev_dax->ranges[i].range; + dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long)range->start, + (unsigned long long)range->end); - __release_region(&dax_region->res, range->start, - range_len(range)); + __release_region(&dax_region->res, range->start, range_len(range)); + if (--dev_dax->nr_range == 0) { + kfree(dev_dax->ranges); + dev_dax->ranges = NULL; } - dev_dax->nr_range = 0; +} + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + while (dev_dax->nr_range) + trim_dev_dax_range(dev_dax); } static void unregister_dev_dax(void *dev) @@ -804,15 +813,10 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, return 0; rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); - if (rc) { - dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, - &alloc->start, &alloc->end); - dev_dax->nr_range--; - __release_region(res, alloc->start, resource_size(alloc)); - return rc; - } + if (rc) + trim_dev_dax_range(dev_dax); - return 0; + return rc; } static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) @@ -885,12 +889,7 @@ static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) if (shrink >= range_len(range)) { devm_release_action(dax_region->dev, unregister_dax_mapping, &mapping->dev); - __release_region(&dax_region->res, range->start, - range_len(range)); - dev_dax->nr_range--; - dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i, - (unsigned long long) range->start, - (unsigned long long) range->end); + trim_dev_dax_range(dev_dax); to_shrink -= shrink; if (!to_shrink) break; @@ -1267,7 +1266,6 @@ static void dev_dax_release(struct device *dev) put_dax(dax_dev); free_dev_dax_id(dev_dax); dax_region_put(dax_region); - kfree(dev_dax->ranges); kfree(dev_dax->pgmap); kfree(dev_dax); } From ff8da37d3d8d438ded5a4841d979899269b94d0d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 19 Dec 2020 16:18:40 +0800 Subject: [PATCH 320/326] device-dax: Avoid an unnecessary check in alloc_dev_dax_range() Swap the calling sequence of krealloc() and __request_region(), call the latter first. In this way, the value of dev_dax->nr_range does not need to be considered when __request_region() failed. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20201219081840.1149-2-thunder.leizhen@huawei.com Signed-off-by: Dan Williams --- drivers/dax/bus.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 720cd140209f..737b207c9e30 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -772,22 +772,14 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, return 0; } - ranges = krealloc(dev_dax->ranges, sizeof(*ranges) - * (dev_dax->nr_range + 1), GFP_KERNEL); - if (!ranges) + alloc = __request_region(res, start, size, dev_name(dev), 0); + if (!alloc) return -ENOMEM; - alloc = __request_region(res, start, size, dev_name(dev), 0); - if (!alloc) { - /* - * If this was an empty set of ranges nothing else - * will release @ranges, so do it now. - */ - if (!dev_dax->nr_range) { - kfree(ranges); - ranges = NULL; - } - dev_dax->ranges = ranges; + ranges = krealloc(dev_dax->ranges, sizeof(*ranges) + * (dev_dax->nr_range + 1), GFP_KERNEL); + if (!ranges) { + __release_region(res, alloc->start, resource_size(alloc)); return -ENOMEM; } From 11cc92eb747aace5aa2b54b65b5cb8325a8981de Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 25 Dec 2020 22:30:58 +1100 Subject: [PATCH 321/326] genirq: Fix export of irq_to_desc() for powerpc KVM Commit 64a1b95bb9fe ("genirq: Restrict export of irq_to_desc()") removed the export of irq_to_desc() unless powerpc KVM is being built, because there is still a use of irq_to_desc() in modular code there. However it used: #ifdef CONFIG_KVM_BOOK3S_64_HV Which doesn't work when that symbol is =m, leading to a build failure: ERROR: modpost: "irq_to_desc" [arch/powerpc/kvm/kvm-hv.ko] undefined! Fix it by checking for the definedness of the correct symbol which is CONFIG_KVM_BOOK3S_64_HV_MODULE. Fixes: 64a1b95bb9fe ("genirq: Restrict export of irq_to_desc()") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 3d0bc38a0bcf..cc1a09406c6e 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -352,7 +352,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif From 61d791365b72a89062fbbea69aa61479476da946 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 25 Dec 2020 15:41:13 -0800 Subject: [PATCH 322/326] drm/amd/display: avoid uninitialized variable warning clang (quite rightly) complains fairly loudly about the newly added mpc1_get_mpc_out_mux() function returning an uninitialized value if the 'opp_id' checks don't pass. This may not happen in practice, but the code really shouldn't return garbage if the sanity checks don't pass. So just initialize 'val' to zero to avoid the issue. Fixes: 110b055b2827 ("drm/amd/display: add getter routine to retrieve mpcc mux") Cc: Josip Pavic Cc: Bindu Ramamurthy Cc: Alex Deucher Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c index a46cb20596fe..100ce0e28fd5 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c @@ -470,7 +470,7 @@ void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock) unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id) { struct dcn10_mpc *mpc10 = TO_DCN10_MPC(mpc); - uint32_t val; + uint32_t val = 0; if (opp_id < MAX_OPP && REG(MUX[opp_id])) REG_GET(MUX[opp_id], MPC_OUT_MUX, &val); From 275e88b06a277ccf89d9c471a777e9b4f8c552b0 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 18 Dec 2020 08:39:05 -0600 Subject: [PATCH 323/326] PCI: tegra: Fix host link initialization Commit b9ac0f9dc8ea ("PCI: dwc: Move dw_pcie_setup_rc() to DWC common code") broke enumeration of downstream devices on Tegra: In non-working case (next-20201211): 0001:00:00.0 PCI bridge: NVIDIA Corporation Device 1ad2 (rev a1) 0001:01:00.0 SATA controller: Marvell Technology Group Ltd. Device 9171 (rev 13) 0005:00:00.0 PCI bridge: NVIDIA Corporation Device 1ad0 (rev a1) In working case (v5.10-rc7): 0001:00:00.0 PCI bridge: Molex Incorporated Device 1ad2 (rev a1) 0001:01:00.0 SATA controller: Marvell Technology Group Ltd. Device 9171 (rev 13) 0005:00:00.0 PCI bridge: Molex Incorporated Device 1ad0 (rev a1) 0005:01:00.0 PCI bridge: PLX Technology, Inc. Device 3380 (rev ab) 0005:02:02.0 PCI bridge: PLX Technology, Inc. Device 3380 (rev ab) 0005:03:00.0 USB controller: PLX Technology, Inc. Device 3380 (rev ab) The problem seems to be dw_pcie_setup_rc() is now called twice before and after the link up handling. The fix is to move Tegra's link up handling to .start_link() function like other DWC drivers. Tegra is a bit more complicated than others as it re-inits the whole DWC controller to retry the link. With this, the initialization ordering is restored to match the prior sequence. Fixes: b9ac0f9dc8ea ("PCI: dwc: Move dw_pcie_setup_rc() to DWC common code") Link: https://lore.kernel.org/r/20201218143905.1614098-1-robh@kernel.org Reported-by: Mian Yousaf Kaukab Tested-by: Mian Yousaf Kaukab Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Cc: Lorenzo Pieralisi Cc: Thierry Reding Cc: Jonathan Hunter Cc: Vidya Sagar --- drivers/pci/controller/dwc/pcie-tegra194.c | 55 ++++++++++++---------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 5597b2a49598..6fa216e52d14 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -853,12 +853,14 @@ static void config_gen3_gen4_eq_presets(struct tegra_pcie_dw *pcie) dw_pcie_writel_dbi(pci, GEN3_RELATED_OFF, val); } -static void tegra_pcie_prepare_host(struct pcie_port *pp) +static int tegra_pcie_dw_host_init(struct pcie_port *pp) { struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); u32 val; + pp->bridge->ops = &tegra_pci_ops; + if (!pcie->pcie_cap_base) pcie->pcie_cap_base = dw_pcie_find_capability(&pcie->pci, PCI_CAP_ID_EXP); @@ -907,10 +909,24 @@ static void tegra_pcie_prepare_host(struct pcie_port *pp) dw_pcie_writel_dbi(pci, CFG_TIMER_CTRL_MAX_FUNC_NUM_OFF, val); } - dw_pcie_setup_rc(pp); - clk_set_rate(pcie->core_clk, GEN4_CORE_CLK_FREQ); + return 0; +} + +static int tegra_pcie_dw_start_link(struct dw_pcie *pci) +{ + u32 val, offset, speed, tmp; + struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); + struct pcie_port *pp = &pci->pp; + bool retry = true; + + if (pcie->mode == DW_PCIE_EP_TYPE) { + enable_irq(pcie->pex_rst_irq); + return 0; + } + +retry_link: /* Assert RST */ val = appl_readl(pcie, APPL_PINMUX); val &= ~APPL_PINMUX_PEX_RST; @@ -929,19 +945,10 @@ static void tegra_pcie_prepare_host(struct pcie_port *pp) appl_writel(pcie, val, APPL_PINMUX); msleep(100); -} - -static int tegra_pcie_dw_host_init(struct pcie_port *pp) -{ - struct dw_pcie *pci = to_dw_pcie_from_pp(pp); - struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); - u32 val, tmp, offset, speed; - - pp->bridge->ops = &tegra_pci_ops; - - tegra_pcie_prepare_host(pp); if (dw_pcie_wait_for_link(pci)) { + if (!retry) + return 0; /* * There are some endpoints which can't get the link up if * root port has Data Link Feature (DLF) enabled. @@ -975,10 +982,11 @@ static int tegra_pcie_dw_host_init(struct pcie_port *pp) val &= ~PCI_DLF_EXCHANGE_ENABLE; dw_pcie_writel_dbi(pci, offset, val); - tegra_pcie_prepare_host(pp); + tegra_pcie_dw_host_init(pp); + dw_pcie_setup_rc(pp); - if (dw_pcie_wait_for_link(pci)) - return 0; + retry = false; + goto retry_link; } speed = dw_pcie_readw_dbi(pci, pcie->pcie_cap_base + PCI_EXP_LNKSTA) & @@ -998,15 +1006,6 @@ static int tegra_pcie_dw_link_up(struct dw_pcie *pci) return !!(val & PCI_EXP_LNKSTA_DLLLA); } -static int tegra_pcie_dw_start_link(struct dw_pcie *pci) -{ - struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); - - enable_irq(pcie->pex_rst_irq); - - return 0; -} - static void tegra_pcie_dw_stop_link(struct dw_pcie *pci) { struct tegra_pcie_dw *pcie = to_tegra_pcie(pci); @@ -2215,6 +2214,10 @@ static int tegra_pcie_dw_resume_noirq(struct device *dev) goto fail_host_init; } + ret = tegra_pcie_dw_start_link(&pcie->pci); + if (ret < 0) + goto fail_host_init; + /* Restore MSI interrupt vector */ dw_pcie_writel_dbi(&pcie->pci, PORT_LOGIC_MSI_CTRL_INT_0_EN, pcie->msi_ctrl_int); From 99e629f14b471d852d28ecf554093c4730ed0927 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 22 Dec 2020 15:07:43 +0000 Subject: [PATCH 324/326] PCI: dwc: Fix inverted condition of DMA mask setup warning Commit 660c486590aa ("PCI: dwc: Set 32-bit DMA mask for MSI target address allocation") added dma_mask_set() call to explicitly set 32-bit DMA mask for MSI message mapping, but for now it throws a warning on ret == 0, while dma_set_mask() returns 0 in case of success. Fix this by inverting the condition. [bhelgaas: join string to make it greppable] Fixes: 660c486590aa ("PCI: dwc: Set 32-bit DMA mask for MSI target address allocation") Link: https://lore.kernel.org/r/20201222150708.67983-1-alobakin@pm.me Signed-off-by: Alexander Lobakin Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-designware-host.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 516b151e0ef3..8a84c005f32b 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -397,12 +397,8 @@ int dw_pcie_host_init(struct pcie_port *pp) pp); ret = dma_set_mask(pci->dev, DMA_BIT_MASK(32)); - if (!ret) { - dev_warn(pci->dev, - "Failed to set DMA mask to 32-bit. " - "Devices with only 32-bit MSI support" - " may not work properly\n"); - } + if (ret) + dev_warn(pci->dev, "Failed to set DMA mask to 32-bit. Devices with only 32-bit MSI support may not work properly\n"); pp->msi_data = dma_map_single_attrs(pci->dev, &pp->msi_msg, sizeof(pp->msi_msg), From c9a3c4e637ac2dce534f7e9e5a80aed93410ccad Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 25 Dec 2020 18:35:49 -0700 Subject: [PATCH 325/326] mfd: ab8500-debugfs: Remove extraneous curly brace Clang errors: drivers/mfd/ab8500-debugfs.c:1526:2: error: non-void function does not return a value [-Werror,-Wreturn-type] } ^ drivers/mfd/ab8500-debugfs.c:1528:2: error: expected identifier or '(' return 0; ^ drivers/mfd/ab8500-debugfs.c:1529:1: error: extraneous closing brace ('}') } ^ 3 errors generated. The cleanup in ab8500_interrupts_show left a curly brace around, remove it to fix the error. Fixes: 886c8121659d ("mfd: ab8500-debugfs: Remove the racy fiddling with irq_desc") Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- drivers/mfd/ab8500-debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index a32039366a93..1cf84562351e 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -1521,7 +1521,6 @@ static int ab8500_interrupts_show(struct seq_file *s, void *p) line + irq_first, num_interrupts[line], num_wake_interrupts[line]); - } seq_putc(s, '\n'); } From f838f8d2b694cf9d524dc4423e9dd2db13892f3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 Dec 2020 09:19:49 -0800 Subject: [PATCH 326/326] mfd: ab8500-debugfs: Remove extraneous seq_putc Commit c9a3c4e637ac ("mfd: ab8500-debugfs: Remove extraneous curly brace") removed a left-over curly brace that caused build failures, but Joe Perches points out that the subsequent 'seq_putc()' should also be removed, because the commit that caused all these problems already added the final '\n' to the seq_printf() above it. Reported-by: Joe Perches Fixes: 886c8121659d ("mfd: ab8500-debugfs: Remove the racy fiddling with irq_desc") Cc: Thomas Gleixner Cc: Nathan Chancellor Signed-off-by: Linus Torvalds --- drivers/mfd/ab8500-debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 1cf84562351e..e43dea89b094 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -1521,7 +1521,6 @@ static int ab8500_interrupts_show(struct seq_file *s, void *p) line + irq_first, num_interrupts[line], num_wake_interrupts[line]); - seq_putc(s, '\n'); } return 0;