From 9d7054fb3ac2e8d252aae1268f20623f244e644f Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Thu, 15 Jun 2023 14:51:45 +0200 Subject: [PATCH 01/25] spi: spi-geni-qcom: correctly handle -EPROBE_DEFER from dma_request_chan() Now that spi_geni_grab_gpi_chan() errors are correctly reported, the -EPROBE_DEFER error should be returned from probe in case the GPI dma driver is built as a module and/or not probed yet. Fixes: b59c122484ec ("spi: spi-geni-qcom: Add support for GPI dma") Fixes: 6532582c353f ("spi: spi-geni-qcom: fix error handling in spi_geni_grab_gpi_chan()") Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20230615-topic-sm8550-upstream-fix-spi-geni-qcom-probe-v2-1-670c3d9e8c9c@linaro.org Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index a98b781b103a..b293428760bc 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -646,6 +646,8 @@ static int spi_geni_init(struct spi_geni_master *mas) geni_se_select_mode(se, GENI_GPI_DMA); dev_dbg(mas->dev, "Using GPI DMA mode for SPI\n"); break; + } else if (ret == -EPROBE_DEFER) { + goto out_pm; } /* * in case of failure to get gpi dma channel, we can still do the From 13bb06f8dd42071cb9a49f6e21099eea05d4b856 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jun 2023 11:18:30 +0200 Subject: [PATCH 02/25] tick/common: Align tick period during sched_timer setup The tick period is aligned very early while the first clock_event_device is registered. At that point the system runs in periodic mode and switches later to one-shot mode if possible. The next wake-up event is programmed based on the aligned value (tick_next_period) but the delta value, that is used to program the clock_event_device, is computed based on ktime_get(). With the subtracted offset, the device fires earlier than the exact time frame.
With a large enough offset the system programs the timer for the next wake-up and the remaining time left is too small to make any boot progress. The system hangs. Move the alignment later to the setup of tick_sched timer. At this point the system switches to oneshot mode and a high resolution clocksource is available. At this point it is safe to align tick_next_period because ktime_get() will now return accurate (not jiffies based) time. [bigeasy: Patch description + testing]. Fixes: e9523a0d81899 ("tick/common: Align tick period with the HZ tick.") Reported-by: Mathias Krause Reported-by: "Bhatnagar, Rishabh" Suggested-by: Mathias Krause Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Richard W.M. Jones Tested-by: Mathias Krause Acked-by: SeongJae Park Cc: stable@vger.kernel.org Link: https://lore.kernel.org/5a56290d-806e-b9a5-f37c-f21958b5a8c0@grsecurity.net Link: https://lore.kernel.org/12c6f9a3-d087-b824-0d05-0d18c9bc1bf3@amazon.com Link: https://lore.kernel.org/r/20230615091830.RxMV2xf_@linutronix.de --- kernel/time/tick-common.c | 13 +------------ kernel/time/tick-sched.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 65b8658da829..e9138cd7a0f5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -218,19 +218,8 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - ktime_t next_p; - u32 rem; - tick_do_timer_cpu = cpu; - - next_p = ktime_get(); - div_u64_rem(next_p, TICK_NSEC, &rem); - if (rem) { - next_p -= rem; - next_p += TICK_NSEC; - } - - tick_next_period = next_p; + tick_next_period = ktime_get(); #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52254679ec48..42c0be3080bd 100644 --- a/kernel/time/tick-sched.c 
+++ b/kernel/time/tick-sched.c @@ -161,8 +161,19 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ - if (last_jiffies_update == 0) + if (last_jiffies_update == 0) { + u32 rem; + + /* + * Ensure that the tick is aligned to a multiple of + * TICK_NSEC. + */ + div_u64_rem(tick_next_period, TICK_NSEC, &rem); + if (rem) + tick_next_period += TICK_NSEC - rem; + last_jiffies_update = tick_next_period; + } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); From 7257d930aadcd62d1c7971ab14f3b1126356abdc Mon Sep 17 00:00:00 2001 From: Teresa Remmet Date: Wed, 14 Jun 2023 14:52:40 +0200 Subject: [PATCH 03/25] regulator: pca9450: Fix LDO3OUT and LDO4OUT MASK L3_OUT and L4_OUT Bit fields range from Bit 0:4 and thus the mask should be 0x1F instead of 0x0F. Fixes: 0935ff5f1f0a ("regulator: pca9450: add pca9450 pmic driver") Signed-off-by: Teresa Remmet Reviewed-by: Frieder Schrempf Link: https://lore.kernel.org/r/20230614125240.3946519-1-t.remmet@phytec.de Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 3c01c2bf84f5..505c908dbb81 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -196,11 +196,11 @@ enum { /* PCA9450_REG_LDO3_VOLT bits */ #define LDO3_EN_MASK 0xC0 -#define LDO3OUT_MASK 0x0F +#define LDO3OUT_MASK 0x1F /* PCA9450_REG_LDO4_VOLT bits */ #define LDO4_EN_MASK 0xC0 -#define LDO4OUT_MASK 0x0F +#define LDO4OUT_MASK 0x1F /* PCA9450_REG_LDO5_VOLT bits */ #define LDO5L_EN_MASK 0xC0 From 54abe19e00cfcc5a72773d15cd00ed19ab763439 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 6 Jun 2023 19:36:13 -0400 Subject: [PATCH 04/25] writeback: fix dereferencing NULL mapping->host on writeback_page_template When commit 19343b5bdd16 
("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") repurposed the writeback_dirty_page trace event as a template to create its new wait_on_page_writeback trace event, it ended up opening a window to NULL pointer dereference crashes due to the (infrequent) occurrence of a race where an access to a page in the swap-cache happens concurrently with the moment this page is being written to disk and the tracepoint is enabled: BUG: kernel NULL pointer dereference, address: 0000000000000040 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 800000010ec0a067 P4D 800000010ec0a067 PUD 102353067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1320 Comm: shmem-worker Kdump: loaded Not tainted 6.4.0-rc5+ #13 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230301gitf80f052277c8-1.fc37 03/01/2023 RIP: 0010:trace_event_raw_event_writeback_folio_template+0x76/0xf0 Code: 4d 85 e4 74 5c 49 8b 3c 24 e8 06 98 ee ff 48 89 c7 e8 9e 8b ee ff ba 20 00 00 00 48 89 ef 48 89 c6 e8 fe d4 1a 00 49 8b 04 24 <48> 8b 40 40 48 89 43 28 49 8b 45 20 48 89 e7 48 89 43 30 e8 a2 4d RSP: 0000:ffffaad580b6fb60 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff90e38035c01c RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff90e38035c044 RBP: ffff90e38035c024 R08: 0000000000000002 R09: 0000000000000006 R10: ffff90e38035c02e R11: 0000000000000020 R12: ffff90e380bac000 R13: ffffe3a7456d9200 R14: 0000000000001b81 R15: ffffe3a7456d9200 FS: 00007f2e4e8a15c0(0000) GS:ffff90e3fbc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000040 CR3: 00000001150c6003 CR4: 0000000000170ee0 Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x76/0x170 ? kernelmode_fixup_or_oops+0x84/0x110 ? exc_page_fault+0x65/0x150 ? asm_exc_page_fault+0x22/0x30 ? trace_event_raw_event_writeback_folio_template+0x76/0xf0 folio_wait_writeback+0x6b/0x80 shmem_swapin_folio+0x24a/0x500 ? 
filemap_get_entry+0xe3/0x140 shmem_get_folio_gfp+0x36e/0x7c0 ? find_busiest_group+0x43/0x1a0 shmem_fault+0x76/0x2a0 ? __update_load_avg_cfs_rq+0x281/0x2f0 __do_fault+0x33/0x130 do_read_fault+0x118/0x160 do_pte_missing+0x1ed/0x2a0 __handle_mm_fault+0x566/0x630 handle_mm_fault+0x91/0x210 do_user_addr_fault+0x22c/0x740 exc_page_fault+0x65/0x150 asm_exc_page_fault+0x22/0x30 This problem arises from the fact that the repurposed writeback_dirty_page trace event code was written assuming that every pointer to mapping (struct address_space) would come from a file-mapped page-cache object, thus mapping->host would always be populated, and that was a valid case before commit 19343b5bdd16. The swap-cache address space (swapper_spaces), however, doesn't populate its ->host (struct inode) pointer, thus leading to the crashes in the corner-case aforementioned. commit 19343b5bdd16 ended up breaking the assignment of __entry->name and __entry->ino for the wait_on_page_writeback tracepoint -- both dependent on mapping->host carrying a pointer to a valid inode. The assignment of __entry->name was fixed by commit 68f23b89067f ("memcg: fix a crash in wb_workfn when a device disappears"), and this commit fixes the remaining case, for __entry->ino. Link: https://lkml.kernel.org/r/20230606233613.1290819-1-aquini@redhat.com Fixes: 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()") Signed-off-by: Rafael Aquini Reviewed-by: Yafang Shao Cc: Aristeu Rozanski Cc: Signed-off-by: Andrew Morton --- include/trace/events/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 86b2a82da546..54e353c9f919 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_template, strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); - __entry->ino = mapping ? 
mapping->host->i_ino : 0; + __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), From 77795f900e2a07c1cbedc375789aefb43843b6c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Jun 2023 14:29:12 -0400 Subject: [PATCH 05/25] mm/mprotect: fix do_mprotect_pkey() limit check The return of do_mprotect_pkey() can still be incorrectly returned as success if there is a gap that spans to or beyond the end address passed in. Update the check to ensure that the end address has indeed been seen. Link: https://lore.kernel.org/all/CABi2SkXjN+5iFoBhxk71t3cmunTk-s=rB4T7qo0UQRh17s49PQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230606182912.586576-1-Liam.Howlett@oracle.com Fixes: 82f951340f25 ("mm/mprotect: fix do_mprotect_pkey() return on error") Signed-off-by: Liam R. Howlett Reported-by: Jeff Xu Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c59e7561698c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: From 95a301eefa82057571207edd06ea36218985a75e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jun 2023 21:11:07 +0100 Subject: [PATCH 06/25] mm/vmalloc: do not output a spurious warning when huge vmalloc() fails In __vmalloc_area_node() we always warn_alloc() when an allocation performed by vm_area_alloc_pages() fails unless it was due to a pending fatal signal. However, huge page allocations instigated either by vmalloc_huge() or __vmalloc_node_range() (or a caller that invokes this like kvmalloc() or kvmalloc_node()) always falls back to order-0 allocations if the huge page allocation fails. 
This renders the warning useless and noisy, especially as all callers appear to be aware that this may fallback. This has already resulted in at least one bug report from a user who was confused by this (see link). Therefore, simply update the code to only output this warning for order-0 pages when no fatal signal is pending. Link: https://bugzilla.suse.com/show_bug.cgi?id=1211410 Link: https://lkml.kernel.org/r/20230605201107.83298-1-lstoakes@gmail.com Fixes: 80b1d8fdfad1 ("mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node()") Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Baoquan He Acked-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..1d13d71687d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. 
+ */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } From 935d44acf621aa0688fef8312dec3e5940f38f4e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 7 Jun 2023 15:24:27 +0200 Subject: [PATCH 07/25] memfd: check for non-NULL file_seals in memfd_create() syscall Ensure that file_seals is non-NULL before using it in the memfd_create() syscall. One situation in which memfd_file_seals_ptr() could return a NULL pointer is when CONFIG_SHMEM=n, oopsing the kernel. Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweicloud.com Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd") Signed-off-by: Roberto Sassu Cc: Marc-André Lureau Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); From c8a8f3b4a95ace7683b615ad9c9aa0eac59013ae Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 7 Jun 2023 14:31:35 +0900 Subject: [PATCH 08/25] mm/khugepaged: fix iteration in collapse_file Remove an unnecessary call to xas_set(index) when iterating over the target range in collapse_file.
The extra call to xas_set reset the xas cursor to the top of the tree, causing the xas_next call on the next iteration to walk the tree to index instead of advancing to index+1. This returned the same page again, which would cause collapse_file to fail because the page is already locked. This bug was hidden when CONFIG_DEBUG_VM was set. When that config was used, the xas_load in a subsequent VM_BUG_ON assert would walk xas from the top of the tree to index, causing the xas_next call on the next loop iteration to advance the cursor as expected. Link: https://lkml.kernel.org/r/20230607053135.2087354-1-stevensd@google.com Fixes: a2e17cc2efc7 ("mm/khugepaged: maintain page cache uptodate flag") Signed-off-by: David Stevens Reviewed-by: Peter Xu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jiaqi Yan Cc: Kirill A . Shutemov Cc: Matthew Wilcox Cc: Yang Shi Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..2d0d58fb4e7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); From b7cb3821905b79b6ed474fd5ba34d1e187649139 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 8 Jun 2023 13:49:27 -0700 Subject: [PATCH 09/25] udmabuf: revert 'Add support for mapping hugepages (v4)' This effectively reverts commit 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)"). Recently, Junxiao Chang found a BUG with page map counting as described here [1]. This issue pointed out that the udmabuf driver was making direct use of subpages of hugetlb pages. This is not a good idea, and no other mm code attempts such use. In addition to the mapcount issue, this also causes issues with hugetlb vmemmap optimization and page poisoning. 
For now, remove hugetlb support. If udmabuf wants to be used on hugetlb mappings, it should be changed to only use complete hugetlb pages. This will require different alignment and size requirements on the UDMABUF_CREATE API. [1] https://lore.kernel.org/linux-mm/20230512072036.1027784-1-junxiao.chang@intel.com/ Link: https://lkml.kernel.org/r/20230608204927.88711-1-mike.kravetz@oracle.com Fixes: 16c243e99d33 ("udmabuf: Add support for mapping hugepages (v4)") Signed-off-by: Mike Kravetz Acked-by: Greg Kroah-Hartman Acked-by: Vivek Kasireddy Acked-by: Gerd Hoffmann Cc: David Hildenbrand Cc: Dongwon Kim Cc: James Houghton Cc: Jerome Marchand Cc: Junxiao Chang Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- drivers/dma-buf/udmabuf.c | 47 +++++---------------------------------- 1 file changed, 6 insertions(+), 41 deletions(-) diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 01f2e86f3f7c..12cf6bb2e3ce 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -207,9 +206,7 @@ static long udmabuf_create(struct miscdevice *device, struct udmabuf *ubuf; struct dma_buf *buf; pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit; - struct page *page, *hpage = NULL; - pgoff_t subpgoff, maxsubpgs; - struct hstate *hpstate; + struct page *page; int seals, ret = -EINVAL; u32 i, flags; @@ -245,7 +242,7 @@ static long udmabuf_create(struct miscdevice *device, if (!memfd) goto err; mapping = memfd->f_mapping; - if (!shmem_mapping(mapping) && !is_file_hugepages(memfd)) + if (!shmem_mapping(mapping)) goto err; seals = memfd_fcntl(memfd, F_GET_SEALS, 0); if (seals == -EINVAL) @@ -256,48 +253,16 @@ static long udmabuf_create(struct miscdevice *device, goto err; pgoff = list[i].offset >> PAGE_SHIFT; pgcnt = list[i].size >> PAGE_SHIFT; - if (is_file_hugepages(memfd)) { - hpstate = hstate_file(memfd); - pgoff = list[i].offset >> 
huge_page_shift(hpstate); - subpgoff = (list[i].offset & - ~huge_page_mask(hpstate)) >> PAGE_SHIFT; - maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT; - } for (pgidx = 0; pgidx < pgcnt; pgidx++) { - if (is_file_hugepages(memfd)) { - if (!hpage) { - hpage = find_get_page_flags(mapping, pgoff, - FGP_ACCESSED); - if (!hpage) { - ret = -EINVAL; - goto err; - } - } - page = hpage + subpgoff; - get_page(page); - subpgoff++; - if (subpgoff == maxsubpgs) { - put_page(hpage); - hpage = NULL; - subpgoff = 0; - pgoff++; - } - } else { - page = shmem_read_mapping_page(mapping, - pgoff + pgidx); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto err; - } + page = shmem_read_mapping_page(mapping, pgoff + pgidx); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto err; } ubuf->pages[pgbuf++] = page; } fput(memfd); memfd = NULL; - if (hpage) { - put_page(hpage); - hpage = NULL; - } } exp_info.ops = &udmabuf_ops; From 2049a7d0cbc6ac8e370e836ed68597be04a7dc49 Mon Sep 17 00:00:00 2001 From: Prathu Baronia Date: Thu, 8 Jun 2023 21:14:49 +0530 Subject: [PATCH 10/25] scripts: fix the gfp flags header path in gfp-translate Since gfp flags have been shifted to gfp_types.h so update the path in the gfp-translate script. 
Link: https://lkml.kernel.org/r/20230608154450.21758-1-prathubaronia2011@gmail.com Fixes: cb5a065b4ea9c ("headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>") Signed-off-by: Prathu Baronia Reviewed-by: David Hildenbrand Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Nicolas Schier Cc: Ingo Molnar Cc: Yury Norov Cc: Signed-off-by: Andrew Morton --- scripts/gfp-translate | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gfp-translate b/scripts/gfp-translate index b2ce416d944b..6c9aed17cf56 100755 --- a/scripts/gfp-translate +++ b/scripts/gfp-translate @@ -63,11 +63,11 @@ fi # Extract GFP flags from the kernel source TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1 -grep -q ___GFP $SOURCE/include/linux/gfp.h +grep -q ___GFP $SOURCE/include/linux/gfp_types.h if [ $? -eq 0 ]; then - grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE + grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE else - grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE + grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE fi # Parse the flags From 6a59cb5158bff13b80f116305155fbe4967a5010 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 7 Jun 2023 15:13:35 -0700 Subject: [PATCH 11/25] scripts/gdb: fix SB_* constants parsing After f15afbd34d8f ("fs: fix undefined behavior in bit shift for SB_NOUSER") the constants were changed from plain integers which LX_VALUE() can parse to constants using the BIT() macro which causes the following: Reading symbols from build/linux-custom/vmlinux...done.
Traceback (most recent call last): File "/home/fainelli/work/buildroot/output/arm64/build/linux-custom/vmlinux-gdb.py", line 25, in <module> import linux.constants File "/home/fainelli/work/buildroot/output/arm64/build/linux-custom/scripts/gdb/linux/constants.py", line 5 LX_SB_RDONLY = ((((1UL))) << (0)) Use LX_GDBPARSED() which does not suffer from that issue. Fixes: f15afbd34d8f ("fs: fix undefined behavior in bit shift for SB_NOUSER") Link: https://lkml.kernel.org/r/20230607221337.2781730-1-florian.fainelli@broadcom.com Signed-off-by: Florian Fainelli Acked-by: Christian Brauner Cc: Hao Ge Cc: Jan Kiszka Cc: Kieran Bingham Cc: Luis Chamberlain Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- scripts/gdb/linux/constants.py.in | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index 471300ba176c..50a92c4e9984 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -48,12 +48,12 @@ if IS_BUILTIN(CONFIG_COMMON_CLK): LX_GDBPARSED(CLK_GET_RATE_NOCACHE) /* linux/fs.h */ -LX_VALUE(SB_RDONLY) -LX_VALUE(SB_SYNCHRONOUS) -LX_VALUE(SB_MANDLOCK) -LX_VALUE(SB_DIRSYNC) -LX_VALUE(SB_NOATIME) -LX_VALUE(SB_NODIRATIME) +LX_GDBPARSED(SB_RDONLY) +LX_GDBPARSED(SB_SYNCHRONOUS) +LX_GDBPARSED(SB_MANDLOCK) +LX_GDBPARSED(SB_DIRSYNC) +LX_GDBPARSED(SB_NOATIME) +LX_GDBPARSED(SB_NODIRATIME) /* linux/htimer.h */ LX_GDBPARSED(hrtimer_resolution) From 679bd7ebdd315bf457a4740b306ae99f1d0a403d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 9 Jun 2023 12:57:32 +0900 Subject: [PATCH 12/25] nilfs2: fix buffer corruption due to concurrent device reads As a result of analysis of a syzbot report, it turned out that in three cases where nilfs2 allocates block device buffers directly via sb_getblk, concurrent reads to the device can corrupt the allocated buffers.
Nilfs2 uses sb_getblk for segment summary blocks, that make up a log header, and the super root block, that is the trailer, and when moving and writing the second super block after fs resize. In any of these, since the uptodate flag is not set when storing metadata to be written in the allocated buffers, the stored metadata will be overwritten if a device read of the same block occurs concurrently before the write. This causes metadata corruption and misbehavior in the log write itself, causing warnings in nilfs_btree_assign() as reported. Fix these issues by setting an uptodate flag on the buffer head on the first or before modifying each buffer obtained with sb_getblk, and clearing the flag on failure. When setting the uptodate flag, the lock_buffer/unlock_buffer pair is used to perform necessary exclusive control, and the buffer is filled to ensure that uninitialized bytes are not mixed into the data read from others. As for buffers for segment summary blocks, they are filled incrementally, so if the uptodate flag was unset on their allocation, set the flag and zero fill the buffer once at that point. Also, regarding the superblock move routine, the starting point of the memset call to zerofill the block is incorrectly specified, which can cause a buffer overflow on file systems with block sizes greater than 4KiB. In addition, if the superblock is moved within a large block, it is necessary to assume the possibility that the data in the superblock will be destroyed by zero-filling before copying. So fix these potential issues as well. 
Link: https://lkml.kernel.org/r/20230609035732.20426-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+31837fe952932efc8fb9@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/00000000000030000a05e981f475@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segbuf.c | 6 ++++++ fs/nilfs2/segment.c | 7 +++++++ fs/nilfs2/super.c | 25 +++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1362ccb64ec7..6e59dc19a732 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ac949fd7603f..c2553024bd25 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? 
@@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..9ba4933087af 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + + lock_buffer(nsbh); + if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. 
+ */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); if (sb2i >= 0) { - memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; From 47a7c01c3efc6581f5dcca40928baeb38e1e40c2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:12 +0000 Subject: [PATCH 13/25] Revert "mm: shrinkers: convert shrinker_rwsem to mutex" Patch series "revert shrinker_srcu related changes". This patch (of 7): This reverts commit cf2e309ebca7bb0916771839f9b580b06c778530. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_mutex back to shrinker_rwsem first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. 
https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-1-qi.zheng@linux.dev Link: https://lkml.kernel.org/r/20230609081518.3039120-2-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Cc: Yujie Liu Signed-off-by: Andrew Morton --- drivers/md/dm-cache-metadata.c | 2 +- drivers/md/dm-thin-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker_debug.c | 14 +++++++------- mm/vmscan.c | 34 +++++++++++++++++----------------- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..fd464fb024c3 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1887,7 +1887,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). 
- * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..04bc62ab7dfe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..2be15b8a6d0b 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -8,7 +8,7 @@ #include /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; extern struct srcu_struct shrinker_srcu; @@ -168,7 +168,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +211,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +229,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +242,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +271,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..4730dba253c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -190,7 +190,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); +DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); @@ -213,7 +213,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, { return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); + lockdep_is_held(&shrinker_rwsem)); } static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, @@ -292,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +308,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) 
info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +324,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -374,7 +374,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +388,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +398,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -743,9 +743,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + 
up_write(&shrinker_rwsem); return; } @@ -755,11 +755,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,13 +810,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_del_rcu(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); From 07252b0f97150b03050f74f42250f275566d70b9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:13 +0000 Subject: [PATCH 14/25] Revert "mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers()" This reverts commit 1643db98d9b314e0a592d152603094fbf7ab906e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. 
So we still need shrinker_rwsem in synchronize_shrinkers() after reverting the shrinker_srcu related changes. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-3-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4730dba253c8..0ba0e1180f3f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -831,11 +831,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } From c534f7cca6b9b1c0dc97d6e9c5587858d4330cd9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:14 +0000 Subject: [PATCH 15/25] Revert "mm: vmscan: hold write lock to reparent shrinker nr_deferred" This reverts commit b3cabea3c9153fd42fe5cb851ac58b51ea2b32b8. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). 
The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. Because there will be other readers after reverting the shrinker_srcu related changes, it is better to go back to holding the read lock when reparenting shrinker nr_deferred. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-4-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ba0e1180f3f..d1d309fc3212 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - up_write(&shrinker_rwsem); + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) From 1a554ecc971406e291cea867112f7f2e377e810e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:15 +0000 Subject: [PATCH 16/25] Revert "mm: shrinkers:
make count and scan in shrinker debugfs lockless" This reverts commit 20cd1892fcc3efc10a7ac327cc3790494bec46b5. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-5-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 2be15b8a6d0b..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include #include #include -#include /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = 
kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; From d6ecbcd70fffcffe79d45f3be7b8fcf5d8e04a3d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:16 +0000 Subject: [PATCH 17/25] Revert "mm: vmscan: add shrinker_srcu_generation" This reverts commit 475733dda5aedba9e086379aafe6b5ffd53e8f5e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. 
Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-6-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d1d309fc3212..50775b73d0c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,7 +192,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -818,7 +817,6 @@ void unregister_shrinker(struct shrinker *shrinker) debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -840,7 +838,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -950,20 +947,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int srcu_idx; + 
int i; if (!mem_cgroup_online(memcg)) return 0; -again: srcu_idx = srcu_read_lock(&shrinker_srcu); info = shrinker_info_srcu(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1009,11 +1004,6 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; - } } unlock: srcu_read_unlock(&shrinker_srcu, srcu_idx); @@ -1053,7 +1043,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; + int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1067,7 +1057,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, srcu_idx = srcu_read_lock(&shrinker_srcu); - generation = atomic_read(&shrinker_srcu_generation); list_for_each_entry_srcu(shrinker, &shrinker_list, list, srcu_read_lock_held(&shrinker_srcu)) { struct shrink_control sc = { @@ -1080,11 +1069,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { - freed = freed ? : 1; - break; - } } srcu_read_unlock(&shrinker_srcu, srcu_idx); From 7cee3603192a198b23c034a7372b599081cbee88 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:17 +0000 Subject: [PATCH 18/25] Revert "mm: vmscan: make memcg slab shrink lockless" This reverts commit caa05325c9126c77ebf114edce51536a0d0a9a08. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). 
The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-7-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 50775b73d0c7..a008d7f2d0fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -210,21 +210,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_rwsem)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -265,7 +252,7 @@ static int 
expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -351,16 +338,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,6 +360,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -407,7 +394,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -416,7 +403,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -947,14 +934,15 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx; int i; if (!mem_cgroup_online(memcg)) return 0; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) 
+ return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; @@ -1004,9 +992,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, i); } freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ From 71c3ad65fabec9620d3f548b2da948c79c7ad9d5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:18 +0000 Subject: [PATCH 19/25] Revert "mm: vmscan: make global slab shrink lockless" This reverts commit f95bdb700bc6bb74e1199b1f5f90c613e152cfa7. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. 
https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-8-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a008d7f2d0fc..5bf98d0a22c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -191,7 +190,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -DEFINE_SRCU(shrinker_srcu); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -742,7 +740,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); @@ -797,15 +795,13 @@ void unregister_shrinker(struct shrinker *shrinker) return; down_write(&shrinker_rwsem); - list_del_rcu(&shrinker->list); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); - shrinker_debugfs_remove(debugfs_entry, debugfs_id); kfree(shrinker->nr_deferred); @@ -825,7 +821,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -1036,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, 
int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1048,10 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1062,9 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } From 782e53d0c14420858dbf0f8f797973c150d3b6d7 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 12 Jun 2023 11:14:56 +0900 Subject: [PATCH 20/25] nilfs2: prevent general protection fault in nilfs_clear_dirty_page() In a syzbot stress test that deliberately causes file system errors on nilfs2 with a corrupted disk image, it has been reported that nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a general protection fault. In nilfs_clear_dirty_pages(), when looking up dirty pages from the page cache and calling nilfs_clear_dirty_page() for each dirty page/folio retrieved, the back reference from the argument page to "mapping" may have been changed to NULL (and possibly others). It is necessary to check this after locking the page/folio. 
So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio after locking it in nilfs_clear_dirty_pages() if the back reference "mapping" from the page/folio is different from the "mapping" that held the page/folio just before. Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+53369d11851d8f26735c@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5cf30827f244..b4e54d079b7d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct folio *folio = fbatch.folios[i]; folio_lock(folio); - nilfs_clear_dirty_page(&folio->page, silent); + + /* + * This folio may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(folio->mapping == mapping)) + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } folio_batch_release(&fbatch); From 823b37e8a7104ca2bcf1945be77f33d4d5740f55 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 15 Jun 2023 09:18:20 +0100 Subject: [PATCH 21/25] mailmap: add entries for Ben Dooks I am going to be losing my sifive.com address soon and I also realised my old Simtec address (from >10 years ago) has also not been updated, so update .mailmap for both.
Link: https://lkml.kernel.org/r/20230615081820.79485-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 650689d00930..c94da2a63d0f 100644 --- a/.mailmap +++ b/.mailmap @@ -70,6 +70,8 @@ Baolin Wang Baolin Wang Bart Van Assche Bart Van Assche +Ben Dooks +Ben Dooks Ben Gardner Ben M Cahill Ben Widawsky From 0518dbe97fe629fea255318841cf3ef1b4532d66 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 14 Jun 2023 22:18:58 +0100 Subject: [PATCH 22/25] selftests/mm: fix cross compilation with LLVM Currently the MM selftests attempt to work out the target architecture by using CROSS_COMPILE or otherwise querying the host machine, storing the target architecture in a variable called MACHINE rather than the usual ARCH though as far as I can tell (including for x86_64) the value is the same as we would use for architecture. When cross compiling with LLVM we don't need a CROSS_COMPILE as LLVM can support many target architectures in a single build so this logic does not work, CROSS_COMPILE is not set and we end up selecting tests for the host rather than target architecture. Fix this by using the more standard ARCH to describe the architecture, taking it from the environment if specified. 
Link: https://lkml.kernel.org/r/20230614-kselftest-mm-llvm-v1-1-180523f277d3@kernel.org Signed-off-by: Mark Brown Cc: Nick Desaulniers Cc: Albert Ou Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Shuah Khan Cc: Tom Rix Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 23af4633f0f4..4f0c50c33ba7 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -5,12 +5,15 @@ LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h include local_config.mk +ifeq ($(ARCH),) + ifeq ($(CROSS_COMPILE),) uname_M := $(shell uname -m 2>/dev/null || echo not) else uname_M := $(shell echo $(CROSS_COMPILE) | grep -o '^[a-z0-9]\+') endif -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') +endif # Without this, failed build products remain, with up-to-date timestamps, # thus tricking Make (and you!) 
into believing that All Is Well, in subsequent @@ -65,7 +68,7 @@ TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) @@ -87,13 +90,13 @@ TEST_GEN_PROGS += $(BINARIES_64) endif else -ifneq (,$(findstring $(MACHINE),ppc64)) +ifneq (,$(findstring $(ARCH),ppc64)) TEST_GEN_PROGS += protection_keys endif endif -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) +ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) TEST_GEN_PROGS += va_high_addr_switch TEST_GEN_PROGS += virtual_address_range TEST_GEN_PROGS += write_to_hugetlbfs @@ -112,7 +115,7 @@ $(TEST_GEN_PROGS): vm_util.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c -ifeq ($(MACHINE),x86_64) +ifeq ($(ARCH),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) From c8e796895e2310b6130e7577248da1d771431a77 Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Tue, 20 Jun 2023 13:28:24 -0700 Subject: [PATCH 23/25] regmap: spi-avmm: Fix regmap_bus max_raw_write The max_raw_write member of the regmap_spi_avmm_bus structure is defined as: .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT SPI_AVMM_VAL_SIZE == 4 and MAX_WRITE_CNT == 1 so this results in a maximum write transfer size of 4 bytes which provides only enough space to transfer the address of the target register. It provides no space for the value to be transferred. 
This bug became an issue (divide-by-zero in _regmap_raw_write()) after the following was accepted into mainline: commit 3981514180c9 ("regmap: Account for register length when chunking") Change max_raw_write to include space (4 additional bytes) for both the register address and value: .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT Fixes: 7f9fb67358a2 ("regmap: add Intel SPI Slave to AVMM Bus Bridge support") Reviewed-by: Matthew Gerlach Signed-off-by: Russ Weight Link: https://lore.kernel.org/r/20230620202824.380313-1-russell.h.weight@intel.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-spi-avmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-spi-avmm.c b/drivers/base/regmap/regmap-spi-avmm.c index 4c2b94b3e30b..6af692844c19 100644 --- a/drivers/base/regmap/regmap-spi-avmm.c +++ b/drivers/base/regmap/regmap-spi-avmm.c @@ -660,7 +660,7 @@ static const struct regmap_bus regmap_spi_avmm_bus = { .reg_format_endian_default = REGMAP_ENDIAN_NATIVE, .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .max_raw_read = SPI_AVMM_VAL_SIZE * MAX_READ_CNT, - .max_raw_write = SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, + .max_raw_write = SPI_AVMM_REG_SIZE + SPI_AVMM_VAL_SIZE * MAX_WRITE_CNT, .free_context = spi_avmm_bridge_ctx_free, }; From afd384f0dbea2229fd11159efb86a5b41051c4a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 8 Jun 2023 17:42:53 -0400 Subject: [PATCH 24/25] Revert "virtio-blk: support completion batching for the IRQ path" This reverts commit 07b679f70d73483930e8d3c293942416d9cd5c13. This change appears to have broken things... We now see applications hanging during disk accesses. e.g. multi-port virtio-blk device running in h/w (FPGA) Host running a simple 'fio' test. 
[global] thread=1 direct=1 ioengine=libaio norandommap=1 group_reporting=1 bs=4K rw=read iodepth=128 runtime=1 numjobs=4 time_based [job0] filename=/dev/vda [job1] filename=/dev/vdb [job2] filename=/dev/vdc ... [job15] filename=/dev/vdp i.e. 16 disks; 4 queues per disk; simple burst of 4KB reads This is repeatedly run in a loop. After a few, normally <10 seconds, fio hangs. With 64 queues (16 disks), failure occurs within a few seconds; with 8 queues (2 disks) it may take ~hour before hanging. Last message: fio-3.19 Starting 8 threads Jobs: 1 (f=1): [_(7),R(1)][68.3%][eta 03h:11m:06s] I think this means at the end of the run 1 queue was left incomplete. 'diskstats' (run while fio is hung) shows no outstanding transactions. e.g. $ cat /proc/diskstats ... 252 0 vda 1843140071 0 14745120568 712568645 0 0 0 0 0 3117947 712568645 0 0 0 0 0 0 252 16 vdb 1816291511 0 14530332088 704905623 0 0 0 0 0 3117711 704905623 0 0 0 0 0 0 ... Other stats (in the h/w, and added to the virtio-blk driver ([a]virtio_queue_rq(), [b]virtblk_handle_req(), [c]virtblk_request_done()) all agree, and show every request had a completion, and that virtblk_request_done() never gets called. e.g. PF= 0 vq=0 1 2 3 [a]request_count - 839416590 813148916 105586179 84988123 [b]completion1_count - 839416590 813148916 105586179 84988123 [c]completion2_count - 0 0 0 0 PF= 1 vq=0 1 2 3 [a]request_count - 823335887 812516140 104582672 75856549 [b]completion1_count - 823335887 812516140 104582672 75856549 [c]completion2_count - 0 0 0 0 i.e. the issue is after the virtio-blk driver. This change was introduced in kernel 6.3.0. I am seeing this using 6.3.3. If I run with an earlier kernel (5.15), it does not occur. If I make a simple patch to the 6.3.3 virtio-blk driver, to skip the blk_mq_add_to_batch()call, it does not fail. e.g. 
kernel 5.15 - this is OK virtio_blk.c,virtblk_done() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { blk_mq_complete_request(req); } kernel 6.3.3 - this fails virtio_blk.c,virtblk_handle_req() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { if (!blk_mq_complete_request_remote(req)) { if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) { virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed } } } If I do, kernel 6.3.3 - this is OK virtio_blk.c,virtblk_handle_req() [irq handler] if (likely(!blk_should_fake_timeout(req->q))) { if (!blk_mq_complete_request_remote(req)) { virtblk_request_done(req); //force this here... if (!blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) { virtblk_request_done(req); //this never gets called... so blk_mq_add_to_batch() must always succeed } } } Perhaps you might like to fix/test/revert this change... Martin Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306090826.C1fZmdMe-lkp@intel.com/ Cc: Suwan Kim Tested-by: edliaw@google.com Reported-by: "Roberts, Martin" Message-Id: <336455b4f630f329380a8f53ee8cad3868764d5c.1686295549.git.mst@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/block/virtio_blk.c | 82 +++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2b918e28acaa..b47358da92a2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -348,63 +348,33 @@ static inline void virtblk_request_done(struct request *req) blk_mq_end_request(req, status); } -static void virtblk_complete_batch(struct io_comp_batch *iob) -{ - struct request *req; - - rq_list_for_each(&iob->req_list, req) { - virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); - virtblk_cleanup_cmd(req); - } - blk_mq_end_request_batch(iob); -} - -static int virtblk_handle_req(struct virtio_blk_vq *vq, - struct io_comp_batch *iob) -{ - struct virtblk_req *vbr; - int req_done = 0; - unsigned int len; - - while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { - struct request *req = blk_mq_rq_from_pdu(vbr); - - if (likely(!blk_should_fake_timeout(req->q)) && - !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), - virtblk_complete_batch)) - virtblk_request_done(req); - req_done++; - } - - return req_done; -} - static void virtblk_done(struct virtqueue *vq) { struct virtio_blk *vblk = vq->vdev->priv; - struct virtio_blk_vq *vblk_vq = &vblk->vqs[vq->index]; - int req_done = 0; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; unsigned long flags; - DEFINE_IO_COMP_BATCH(iob); + unsigned int len; - spin_lock_irqsave(&vblk_vq->lock, flags); + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); do { virtqueue_disable_cb(vq); - req_done += virtblk_handle_req(vblk_vq, &iob); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); + req_done = true; + } if (unlikely(virtqueue_is_broken(vq))) break; } while (!virtqueue_enable_cb(vq)); - if 
(req_done) { - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - - /* In case queue is stopped waiting for more buffers. */ + /* In case queue is stopped waiting for more buffers. */ + if (req_done) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); - } - spin_unlock_irqrestore(&vblk_vq->lock, flags); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx) @@ -1283,15 +1253,37 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) } } +static void virtblk_complete_batch(struct io_comp_batch *iob) +{ + struct request *req; + + rq_list_for_each(&iob->req_list, req) { + virtblk_unmap_data(req, blk_mq_rq_to_pdu(req)); + virtblk_cleanup_cmd(req); + } + blk_mq_end_request_batch(iob); +} + static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct virtio_blk *vblk = hctx->queue->queuedata; struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx); + struct virtblk_req *vbr; unsigned long flags; + unsigned int len; int found = 0; spin_lock_irqsave(&vq->lock, flags); - found = virtblk_handle_req(vq, iob); + + while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) { + struct request *req = blk_mq_rq_from_pdu(vbr); + + found++; + if (!blk_mq_complete_request_remote(req) && + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), + virtblk_complete_batch)) + virtblk_request_done(req); + } if (found) blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); From 69cbeb61ff9093a9155cb19a36d633033f71093a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 21 Jun 2023 10:58:46 -0700 Subject: [PATCH 25/25] Revert "efi: random: refresh non-volatile random seed when RNG is initialized" This reverts commit e7b813b32a42a3a6281a4fd9ae7700a0257c1d50 (and the subsequent fix for it: 41a15855c1ee "efi: random: fix NULL-deref when refreshing seed"). It turns out to cause non-deterministic boot stalls on at least a HP 6730b laptop.
Reported-and-bisected-by: Sami Korkalainen Link: https://lore.kernel.org/all/GQUnKz2al3yke5mB2i1kp3SzNHjK8vi6KJEh7rnLrOQ24OrlljeCyeWveLW9pICEmB9Qc8PKdNt3w1t_g3-Uvxq1l8Wj67PpoMeWDoH8PKk=@proton.me/ Cc: Jason A. Donenfeld Cc: Bagas Sanjaya Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- drivers/firmware/efi/efi.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index abeff7dc0b58..34b9e7876538 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -361,24 +361,6 @@ static void __init efi_debugfs_init(void) static inline void efi_debugfs_init(void) {} #endif -static void refresh_nv_rng_seed(struct work_struct *work) -{ - u8 seed[EFI_RANDOM_SEED_SIZE]; - - get_random_bytes(seed, sizeof(seed)); - efi.set_variable(L"RandomSeed", &LINUX_EFI_RANDOM_SEED_TABLE_GUID, - EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, sizeof(seed), seed); - memzero_explicit(seed, sizeof(seed)); -} -static int refresh_nv_rng_seed_notification(struct notifier_block *nb, unsigned long action, void *data) -{ - static DECLARE_WORK(work, refresh_nv_rng_seed); - schedule_work(&work); - return NOTIFY_DONE; -} -static struct notifier_block refresh_nv_rng_seed_nb = { .notifier_call = refresh_nv_rng_seed_notification }; - /* * We register the efi subsystem with the firmware subsystem and the * efivars subsystem with the efi subsystem, if the system was booted with @@ -451,9 +433,6 @@ static int __init efisubsys_init(void) platform_device_register_simple("efi_secret", 0, NULL, 0); #endif - if (efi_rt_services_supported(EFI_RT_SUPPORTED_SET_VARIABLE)) - execute_with_initialized_rng(&refresh_nv_rng_seed_nb); - return 0; err_remove_group: