From 84a24bf8c52e66b7ac89ada5e3cfbe72d65c1896 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Thu, 15 Apr 2021 17:27:11 +0000 Subject: [PATCH 01/12] locking/qrwlock: Fix ordering in queued_write_lock_slowpath() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While this code is executed with the wait_lock held, a reader can acquire the lock without holding wait_lock. The writer side loops checking the value with the atomic_cond_read_acquire(), but only truly acquires the lock when the compare-and-exchange is completed successfully which isn’t ordered. This exposes the window between the acquire and the cmpxchg to an A-B-A problem which allows reads following the lock acquisition to observe values speculatively before the write lock is truly acquired. We've seen a problem in epoll where the reader does a xchg while holding the read lock, but the writer can see a value change out from under it. Writer | Reader -------------------------------------------------------------------------------- ep_scan_ready_list() | |- write_lock_irq() | |- queued_write_lock_slowpath() | |- atomic_cond_read_acquire() | | read_lock_irqsave(&ep->lock, flags); --> (observes value before unlock) | chain_epi_lockless() | | epi->next = xchg(&ep->ovflist, epi); | | read_unlock_irqrestore(&ep->lock, flags); | | | atomic_cmpxchg_relaxed() | |-- READ_ONCE(ep->ovflist); | A core can order the read of the ovflist ahead of the atomic_cmpxchg_relaxed(). Switching the cmpxchg to use acquire semantics addresses this issue at which point the atomic_cond_read can be switched to use relaxed semantics. Fixes: b519b56e378ee ("locking/qrwlock: Use atomic_cond_read_acquire() when spinning in qrwlock") Signed-off-by: Ali Saidi [peterz: use try_cmpxchg()] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steve Capper Acked-by: Will Deacon Acked-by: Waiman Long Tested-by: Steve Capper --- kernel/locking/qrwlock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 4786dd271b45..b94f3831e963 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { + int cnts; + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) /* When no more readers or writers, set the locked flag */ do { - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, - _QW_LOCKED) != _QW_WAITING); + cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); } From 0c89d87d1d43d9fa268d1dc489518564d58bf497 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Sat, 10 Apr 2021 15:35:23 +0800 Subject: [PATCH 02/12] preempt/dynamic: Fix typo in macro conditional statement Commit 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() static call") tried to provide irqentry_exit_cond_resched() static call in irqentry_exit, but has a typo in macro conditional statement. Fixes: 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() static call") Signed-off-by: Zhouyi Zhou Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210410073523.5493-1-zhouzhouyi@gmail.com --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8442e5c9cfa2..2003d69bd6d5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -422,7 +422,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) instrumentation_begin(); if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMT_DYNAMIC +#ifdef CONFIG_PREEMPT_DYNAMIC static_call(irqentry_exit_cond_resched)(); #else irqentry_exit_cond_resched(); From 5849cdf8c120e3979c57d34be55b92d90a77a47e Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 16 Apr 2021 14:02:07 +0200 Subject: [PATCH 03/12] x86/crash: Fix crash_setup_memmap_entries() out-of-bounds access Commit in Fixes: added support for kexec-ing a kernel on panic using a new system call. As part of it, it does prepare a memory map for the new kernel. However, while doing so, it wrongly accesses memory it has not allocated: it accesses the first element of the cmem->ranges[] array in memmap_exclude_ranges() but it has not allocated the memory for it in crash_setup_memmap_entries(). As KASAN reports: BUG: KASAN: vmalloc-out-of-bounds in crash_setup_memmap_entries+0x17e/0x3a0 Write of size 8 at addr ffffc90000426008 by task kexec/1187 (gdb) list *crash_setup_memmap_entries+0x17e 0xffffffff8107cafe is in crash_setup_memmap_entries (arch/x86/kernel/crash.c:322). 317 unsigned long long mend) 318 { 319 unsigned long start, end; 320 321 cmem->ranges[0].start = mstart; 322 cmem->ranges[0].end = mend; 323 cmem->nr_ranges = 1; 324 325 /* Exclude elf header region */ 326 start = image->arch.elf_load_addr; (gdb) Make sure the ranges array becomes a single element allocated. [ bp: Write a proper commit message. ] Fixes: dd5f726076cc ("kexec: support for kexec on panic using new system call") Signed-off-by: Mike Galbraith Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Link: https://lkml.kernel.org/r/725fa3dc1da2737f0f6188a1a9701bead257ea9d.camel@gmx.de --- arch/x86/kernel/crash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a8f3af257e26..b1deacbeb266 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; From f2211881e737cade55e0ee07cf6a26d91a35a6fe Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 15 Apr 2021 16:34:16 +0800 Subject: [PATCH 04/12] perf data: Fix error return code in perf_data__create_dir() Although 'ret' has been initialized to -1, but it will be reassigned by the "ret = open(...)" statement in the for loop. So that, the value of 'ret' is unknown when asprintf() failed. Reported-by: Hulk Robot Signed-off-by: Zhen Lei Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index f29af4fc3d09..8fca4779ae6a 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data) int perf_data__create_dir(struct perf_data *data, int nr) { struct perf_data_file *files = NULL; - int i, ret = -1; + int i, ret; if (WARN_ON(!data->is_dir)) return -EINVAL; @@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr) for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; - if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0) + ret = asprintf(&file->path, "%s/data.%d", data->path, i); + if (ret < 0) goto out_err; ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); From 9d480158ee86ad606d3a8baaf81e6b71acbfd7d5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Apr 2021 14:22:43 -0700 Subject: [PATCH 05/12] perf/x86/intel/uncore: Remove uncore extra PCI dev HSWEP_PCI_PCU_3 There may be a kernel panic on the Haswell server and the Broadwell server, if the snbep_pci2phy_map_init() return error. The uncore_extra_pci_dev[HSWEP_PCI_PCU_3] is used in the cpu_init() to detect the existence of the SBOX, which is a MSR type of PMON unit. The uncore_extra_pci_dev is allocated in the uncore_pci_init(). If the snbep_pci2phy_map_init() returns error, perf doesn't initialize the PCI type of the PMON units, so the uncore_extra_pci_dev will not be allocated. But perf may continue initializing the MSR type of PMON units. A null dereference kernel panic will be triggered. The sockets in a Haswell server or a Broadwell server are identical. Only need to detect the existence of the SBOX once. Current perf probes all available PCU devices and stores them into the uncore_extra_pci_dev. It's unnecessary. Use the pci_get_device() to replace the uncore_extra_pci_dev. Only detect the existence of the SBOX on the first available PCU device once. Factor out hswep_has_limit_sbox(), since the Haswell server and the Broadwell server uses the same way to detect the existence of the SBOX. Add some macros to replace the magic number. Fixes: 5306c31c5733 ("perf/x86/uncore/hsw-ep: Handle systems with only two SBOXes") Reported-by: Steve Wahl Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steve Wahl Link: https://lkml.kernel.org/r/1618521764-100923-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 59 ++++++++++++---------------- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d0707c..9b8937631838 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1159,7 +1159,6 @@ enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, BDX_PCI_QPI_PORT2_FILTER, - HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { NULL, }; +#define HSWEP_PCU_DID 0x2fc0 +#define HSWEP_PCU_CAPID4_OFFET 0x94 +#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3) + +static bool hswep_has_limit_sbox(unsigned int device) +{ + struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + u32 capid4; + + if (!dev) + return false; + + pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + if (!hswep_get_chop(capid4)) + return true; + + return false; +} + void hswep_uncore_cpu_init(void) { - int pkg = boot_cpu_data.logical_proc_id; - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } + if (hswep_has_limit_sbox(HSWEP_PCU_DID)) + hswep_uncore_sbox.num_boxes = 2; uncore_msr_uncores = hswep_msr_uncores; } @@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { EVENT_CONSTRAINT_END }; +#define BDX_PCU_DID 0x6fc0 + void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) { - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; /* Detect systems with no SBOXes */ - } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - struct pci_dev *pdev; - u32 capid4; + if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; - pci_read_config_dword(pdev, 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - } hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, BDX_PCI_QPI_PORT2_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; From 4b2f1e59229b9da319d358828cdfa4ddbc140769 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 21 Apr 2021 17:18:34 -0700 Subject: [PATCH 06/12] perf/x86/kvm: Fix Broadwell Xeon stepping in isolation_ucodes[] The only stepping of Broadwell Xeon parts is stepping 1. Fix the relevant isolation_ucodes[] entry, which previously enumerated stepping 2. Although the original commit was characterized as an optimization, it is also a workaround for a correctness issue. If a PMI arrives between kvm's call to perf_guest_get_msrs() and the subsequent VM-entry, a stale value for the IA32_PEBS_ENABLE MSR may be restored at the next VM-exit. This is because, unbeknownst to kvm, PMI throttling may clear bits in the IA32_PEBS_ENABLE MSR. CPUs with "PEBS isolation" don't suffer from this issue, because perf_guest_get_msrs() doesn't report the IA32_PEBS_ENABLE value. Fixes: 9b545c04abd4f ("perf/x86/kvm: Avoid unnecessary work in guest filtering") Signed-off-by: Jim Mattson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Shier Acked-by: Andi Kleen Link: https://lkml.kernel.org/r/20210422001834.1748319-1-jmattson@google.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 37ce38403cb8..c57ec8e27907 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4516,7 +4516,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), From b14585d9f18dc617e975815570fe836be656b1da Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 20 Apr 2021 23:15:53 +0800 Subject: [PATCH 07/12] perf auxtrace: Fix potential NULL pointer dereference In the function auxtrace_parse_snapshot_options(), the callback pointer "itr->parse_snapshot_options" can be NULL if it has not been set during the AUX record initialization. This can cause tool crashing if the callback pointer "itr->parse_snapshot_options" is dereferenced without performing NULL check. Add a NULL check for the pointer "itr->parse_snapshot_options" before invoke the callback. Fixes: d20031bb63dd6dde ("perf tools: Add AUX area tracing Snapshot Mode") Signed-off-by: Leo Yan Acked-by: Adrian Hunter Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: http://lore.kernel.org/lkml/20210420151554.2031768-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 5b6ccb90b397..1b4091a3b508 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -634,7 +634,7 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, break; } - if (itr) + if (itr && itr->parse_snapshot_options) return itr->parse_snapshot_options(itr, opts, str); pr_err("No AUX area tracing to snapshot\n"); From 671b60cb6a897a5b3832fe57657152f2c3995e25 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 21 Apr 2021 14:04:00 +0200 Subject: [PATCH 08/12] perf ftrace: Fix access to pid in array when setting a pid filter Command 'perf ftrace -v -- ls' fails in s390 (at least 5.12.0rc6). The root cause is a missing pointer dereference which causes an array element address to be used as PID. Fix this by extracting the PID. Output before: # ./perf ftrace -v -- ls function_graph tracer is used write '-263732416' to tracing/set_ftrace_pid failed: Invalid argument failed to set ftrace pid # Output after: ./perf ftrace -v -- ls function_graph tracer is used # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 4) | rcu_read_lock_sched_held() { 4) 0.552 us | rcu_lockdep_current_cpu_online(); 4) 6.124 us | } Reported-by: Alexander Schmidt Signed-off-by: Thomas Richter Acked-by: Namhyung Kim Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210421120400.2126433-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index d49448a1060c..87cb11a7a3ee 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace) for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) { scnprintf(buf, sizeof(buf), "%d", - ftrace->evlist->core.threads->map[i]); + perf_thread_map__pid(ftrace->evlist->core.threads, i)); if (append_tracing_file("set_ftrace_pid", buf) < 0) return -1; } From c6f87141254d16e281e4b4431af7316895207b8f Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 15 Apr 2021 17:27:44 +0800 Subject: [PATCH 09/12] perf map: Fix error return code in maps__clone() Although 'err' has been initialized to -ENOMEM, but it will be reassigned by the "err = unwind__prepare_access(...)" statement in the for loop. So that, the value of 'err' is unknown when map__clone() failed. Fixes: 6c502584438bda63 ("perf unwind: Call unwind__prepare_access for forked thread") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: zhen lei Link: http://lore.kernel.org/lkml/20210415092744.3793-1-thunder.leizhen@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index fbc40a2c17d4..8af693d9678c 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -840,15 +840,18 @@ out: int maps__clone(struct thread *thread, struct maps *parent) { struct maps *maps = thread->maps; - int err = -ENOMEM; + int err; struct map *map; down_read(&parent->lock); maps__for_each_entry(parent, map) { struct map *new = map__clone(map); - if (new == NULL) + + if (new == NULL) { + err = -ENOMEM; goto out_unlock; + } err = unwind__prepare_access(maps, new, NULL); if (err) From 9c1a07442c95f6e64dc8de099e9f35ea73db7852 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 23 Apr 2021 16:23:20 +0800 Subject: [PATCH 10/12] KVM: x86/xen: Take srcu lock when accessing kvm_memslots() kvm_memslots() will be called by kvm_write_guest_offset_cached() so we should take the srcu lock. Let's pull the srcu lock operation from kvm_steal_time_set_preempted() again to fix xen part. Fixes: 30b5c851af7 ("KVM: x86/xen: Add support for vCPU runstate information") Signed-off-by: Wanpeng Li Message-Id: <1619166200-9215-1-git-send-email-wanpengli@tencent.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eca63625aee4..ee0dc58ac3a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4025,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; - int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4033,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - goto out; + return; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4049,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); - -out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); From 799bac5512188522213e2d7eb78ca7094dfdf30c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Apr 2021 09:32:35 -0700 Subject: [PATCH 11/12] Revert "net/rds: Avoid potential use after free in rds_send_remove_from_sock" This reverts commit 0c85a7e87465f2d4cbc768e245f4f45b2f299b05. The games with 'rm' are on (two separate instances) of a local variable, and make no difference. Quoting Aditya Pakki: "I was the author of the patch and it was the cause of the giant UMN revert. The patch is garbage and I was unaware of the steps involved in retracting it. I *believed* the maintainers would pull it, given it was already under Greg's list. The patch does not introduce any bugs but is pointless and is stupid. I accept my incompetence and for not requesting a revert earlier." Link: https://lwn.net/Articles/854319/ Requested-by: Aditya Pakki Cc: Santosh Shilimkar Cc: David S. Miller Cc: Al Viro Signed-off-by: Linus Torvalds --- net/rds/message.c | 1 - net/rds/send.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/net/rds/message.c b/net/rds/message.c index 4fc66ff0f1ec..799034e0f513 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -180,7 +180,6 @@ void rds_message_put(struct rds_message *rm) rds_message_purge(rm); kfree(rm); - rm = NULL; } } EXPORT_SYMBOL_GPL(rds_message_put); diff --git a/net/rds/send.c b/net/rds/send.c index fe5264b9d4b3..985d0b7713ac 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -665,7 +665,7 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) unlock_and_drop: spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); - if (was_on_sock && rm) + if (was_on_sock) rds_message_put(rm); } From 9f4ad9e425a1d3b6a34617b8ea226d56a119a717 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Apr 2021 13:49:08 -0700 Subject: [PATCH 12/12] Linux 5.12 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bc19584fee59..3a10a8e08b6d 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Frozen Wasteland # *DOCUMENTATION*