From 0a65579cdd28bed3e84e1b4929c3080da4f06d79 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:32 +0800 Subject: [PATCH 001/474] swiotlb: Refactor swiotlb init functions Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct initialization to make the code reusable. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Acked-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 50 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e50df8d8f87e..414db5fc8de9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -168,9 +168,28 @@ void __init swiotlb_update_mem_attributes(void) memset(vaddr, 0, bytes); } +static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, + unsigned long nslabs, bool late_alloc) +{ + void *vaddr = phys_to_virt(start); + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + + mem->nslabs = nslabs; + mem->start = start; + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = late_alloc; + spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; + } + memset(vaddr, 0, bytes); +} + int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - unsigned long bytes = nslabs << IO_TLB_SHIFT, i; struct io_tlb_mem *mem; size_t alloc_size; @@ -186,16 +205,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (!mem) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - mem->nslabs = nslabs; - mem->start = __pa(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } + + swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); io_tlb_default_mem = mem; if (verbose) @@ -282,8 +293,8 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - unsigned long bytes = nslabs << IO_TLB_SHIFT, i; struct io_tlb_mem *mem; + unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; @@ -297,20 +308,9 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!mem) return -ENOMEM; - mem->nslabs = nslabs; - mem->start = virt_to_phys(tlb); - mem->end = mem->start + bytes; - mem->index = 0; - mem->late_alloc = 1; - spin_lock_init(&mem->lock); - for (i = 0; i < mem->nslabs; i++) { - mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->slots[i].orig_addr = INVALID_PHYS_ADDR; - mem->slots[i].alloc_size = 0; - } - + memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); + swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); io_tlb_default_mem = mem; swiotlb_print_info(); From 6e675a1c455ea7579c7eaf1a38fe64267039d6fe Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:33 +0800 Subject: [PATCH 002/474] swiotlb: Refactor swiotlb_create_debugfs Split the debugfs creation to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 414db5fc8de9..ae6a151d0a41 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -669,19 +669,26 @@ bool is_swiotlb_active(void) EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS +static struct dentry *debugfs_dir; -static int __init swiotlb_create_debugfs(void) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) +{ + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); + debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); +} + +static int __init swiotlb_create_default_debugfs(void) { struct io_tlb_mem *mem = io_tlb_default_mem; - if (!mem) - return 0; - mem->debugfs = debugfs_create_dir("swiotlb", NULL); - debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); - debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); + debugfs_dir = debugfs_create_dir("swiotlb", NULL); + if (mem) { + mem->debugfs = debugfs_dir; + swiotlb_create_debugfs_files(mem); + } return 0; } -late_initcall(swiotlb_create_debugfs); +late_initcall(swiotlb_create_default_debugfs); #endif From 69031f500865ee3eee19566a1b9c40a189817eaa Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:34 +0800 Subject: [PATCH 003/474] swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used Always have the pointer to the swiotlb pool used in struct device. This could help simplify the code for other pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/base/core.c | 4 ++++ include/linux/device.h | 4 ++++ kernel/dma/swiotlb.c | 8 ++++---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index cadcade65825..ea5b85354526 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include /* for dma_default_coherent */ @@ -2846,6 +2847,9 @@ void device_initialize(struct device *dev) defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) dev->dma_coherent = dma_default_coherent; #endif +#ifdef CONFIG_SWIOTLB + dev->dma_io_tlb_mem = io_tlb_default_mem; +#endif } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..2a22875238a6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,6 +423,7 @@ struct dev_links_info { * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations + * @dma_io_tlb_mem: Pointer to the swiotlb pool used. Not for driver use. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. @@ -531,6 +532,9 @@ struct device { #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ +#endif +#ifdef CONFIG_SWIOTLB + struct io_tlb_mem *dma_io_tlb_mem; #endif /* arch specific additions */ struct dev_archdata archdata; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index ae6a151d0a41..33d413beddd4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -348,7 +348,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; @@ -429,7 +429,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -506,7 +506,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int i; int index; @@ -557,7 +557,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; From 7fd856aa7f4261ddac62ea59d8383fef22a0690e Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:35 +0800 Subject: [PATCH 004/474] swiotlb: Update is_swiotlb_buffer to add a struct device argument Update is_swiotlb_buffer to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/dma-iommu.c | 12 ++++++------ drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 7 ++++--- kernel/dma/direct.c | 6 +++--- kernel/dma/direct.h | 6 +++--- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 98ba927aee1a..4e34e8b26579 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -506,7 +506,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, __iommu_dma_unmap(dev, dma_addr, size); - if (unlikely(is_swiotlb_buffer(phys))) + if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } @@ -577,7 +577,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, } iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) + if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); return iova; } @@ -783,7 +783,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(phys, size, dir); - if (is_swiotlb_buffer(phys)) + if (is_swiotlb_buffer(dev, phys)) swiotlb_sync_single_for_cpu(dev, phys, size, dir); } @@ -796,7 +796,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - if (is_swiotlb_buffer(phys)) + if (is_swiotlb_buffer(dev, phys)) swiotlb_sync_single_for_device(dev, phys, size, dir); if (!dev_is_dma_coherent(dev)) @@ -817,7 +817,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); - if (is_swiotlb_buffer(sg_phys(sg))) + if (is_swiotlb_buffer(dev, sg_phys(sg))) swiotlb_sync_single_for_cpu(dev, sg_phys(sg), sg->length, dir); } @@ -834,7 +834,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, return; for_each_sg(sgl, sg, nelems, i) { - if (is_swiotlb_buffer(sg_phys(sg))) + if (is_swiotlb_buffer(dev, sg_phys(sg))) swiotlb_sync_single_for_device(dev, sg_phys(sg), sg->length, dir); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 24d11861ac7d..0c4fb34f11ab 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) * in our domain. Therefore _only_ check address within our domain. */ if (pfn_valid(PFN_DOWN(paddr))) - return is_swiotlb_buffer(paddr); + return is_swiotlb_buffer(dev, paddr); return 0; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 216854a5e513..d1f3d95881cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -2,6 +2,7 @@ #ifndef __LINUX_SWIOTLB_H #define __LINUX_SWIOTLB_H +#include #include #include #include @@ -101,9 +102,9 @@ struct io_tlb_mem { }; extern struct io_tlb_mem *io_tlb_default_mem; -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && paddr >= mem->start && paddr < mem->end; } @@ -115,7 +116,7 @@ bool is_swiotlb_active(void); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE -static inline bool is_swiotlb_buffer(phys_addr_t paddr) +static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f737e3347059..84c9feb5474a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, sg->length, dir); @@ -369,7 +369,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(paddr, sg->length, dir); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); @@ -504,7 +504,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) { return !dev_is_dma_coherent(dev) || - is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); + is_swiotlb_buffer(dev, dma_to_phys(dev, dma_addr)); } /** diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 50afc05b6f1d..13e9e7158d94 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -56,7 +56,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, { phys_addr_t paddr = dma_to_phys(dev, addr); - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) @@ -73,7 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_sync_dma_for_cpu_all(); } - if (unlikely(is_swiotlb_buffer(paddr))) + if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) @@ -113,7 +113,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); - if (unlikely(is_swiotlb_buffer(phys))) + if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ From 6f2beb268a5d35504a636c4a3b7aaa76ec32d96c Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:36 +0800 Subject: [PATCH 005/474] swiotlb: Update is_swiotlb_active to add a struct device argument Update is_swiotlb_active to add a struct device argument. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/nouveau/nouveau_ttm.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- include/linux/swiotlb.h | 4 ++-- kernel/dma/direct.c | 2 +- kernel/dma/swiotlb.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index ce6b664b10aa..89a894354263 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) max_order = MAX_ORDER; #ifdef CONFIG_SWIOTLB - if (is_swiotlb_active()) { + if (is_swiotlb_active(obj->base.dev->dev)) { unsigned int max_segment; max_segment = swiotlb_max_segment(); diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index f4c2e46b6fe1..2ca9d9a9e5d5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -276,7 +276,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) } #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86) - need_swiotlb = is_swiotlb_active(); + need_swiotlb = is_swiotlb_active(dev->dev); #endif ret = ttm_device_init(&drm->ttm.bdev, &nouveau_bo_driver, drm->dev->dev, diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index b7a8f3a1921f..0d56985bfe81 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev) spin_unlock(&pcifront_dev_lock); - if (!err && !is_swiotlb_active()) { + if (!err && !is_swiotlb_active(&pdev->xdev->dev)) { err = pci_xen_swiotlb_init_late(); if (err) dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n"); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d1f3d95881cd..dd1c30a83058 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -112,7 +112,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); -bool is_swiotlb_active(void); +bool is_swiotlb_active(struct device *dev); void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE @@ -132,7 +132,7 @@ static inline size_t swiotlb_max_mapping_size(struct device *dev) return SIZE_MAX; } -static inline bool is_swiotlb_active(void) +static inline bool is_swiotlb_active(struct device *dev) { return false; } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 84c9feb5474a..7a88c34d0867 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask) size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ - if (is_swiotlb_active() && + if (is_swiotlb_active(dev) && (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) return swiotlb_max_mapping_size(dev); return SIZE_MAX; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 33d413beddd4..d8677d6637dd 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -662,9 +662,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } -bool is_swiotlb_active(void) +bool is_swiotlb_active(struct device *dev) { - return io_tlb_default_mem != NULL; + return dev->dma_io_tlb_mem != NULL; } EXPORT_SYMBOL_GPL(is_swiotlb_active); From 903cd0f315fe426c6a64c54ed389de0becb663dc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:20 +0800 Subject: [PATCH 006/474] swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing Propagate the swiotlb_force into io_tlb_default_mem->force_bounce and use it to determine whether to bounce the data or not. This will be useful later to allow for different pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v2: Includes Will's fix] --- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 13 +++++++++++++ kernel/dma/direct.c | 2 +- kernel/dma/direct.h | 2 +- kernel/dma/swiotlb.c | 4 ++++ 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0c4fb34f11ab..785ec7e8be01 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -374,7 +374,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size, true) && !range_straddles_page_boundary(phys, size) && !xen_arch_need_swiotlb(dev, phys, dev_addr) && - swiotlb_force != SWIOTLB_FORCE) + !is_swiotlb_force_bounce(dev)) goto done; /* diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index dd1c30a83058..da348671b0d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -84,6 +84,7 @@ extern enum swiotlb_force swiotlb_force; * unmap calls. * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator + * @force_bounce: %true if swiotlb bouncing is forced */ struct io_tlb_mem { phys_addr_t start; @@ -94,6 +95,7 @@ struct io_tlb_mem { spinlock_t lock; struct dentry *debugfs; bool late_alloc; + bool force_bounce; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -109,6 +111,13 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) return mem && paddr >= mem->start && paddr < mem->end; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->force_bounce; +} + void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -120,6 +129,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { return false; } +static inline bool is_swiotlb_force_bounce(struct device *dev) +{ + return false; +} static inline void swiotlb_exit(void) { } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 7a88c34d0867..a92465b4eb12 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -496,7 +496,7 @@ size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ if (is_swiotlb_active(dev) && - (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE)) + (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev))) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 13e9e7158d94..4632b0f4f72e 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -87,7 +87,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + if (is_swiotlb_force_bounce(dev)) return swiotlb_map(dev, phys, size, dir, attrs); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8677d6637dd..04319dd22d28 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -179,6 +179,10 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->end = mem->start + bytes; mem->index = 0; mem->late_alloc = late_alloc; + + if (swiotlb_force == SWIOTLB_FORCE) + mem->force_bounce = true; + spin_lock_init(&mem->lock); for (i = 0; i < mem->nslabs; i++) { mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); From 36f7b2f3ca5f0bee00abf9ea52748ce0644a21c6 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 24 Jun 2021 23:55:21 +0800 Subject: [PATCH 007/474] swiotlb: Move alloc_size to swiotlb_find_slots Rename find_slots to swiotlb_find_slots and move the maintenance of alloc_size to it for better code reusability later. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 04319dd22d28..0116a630dc13 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -430,8 +430,8 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size) +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); @@ -442,6 +442,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned long flags; BUG_ON(!nslots); @@ -486,8 +487,11 @@ not_found: return -1; found: - for (i = index; i < index + nslots; i++) + for (i = index; i < index + nslots; i++) { mem->slots[i].list = 0; + mem->slots[i].alloc_size = + alloc_size - (offset + ((i - index) << IO_TLB_SHIFT)); + } for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) @@ -528,7 +532,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, orig_addr, alloc_size + offset); + index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -542,11 +546,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size + offset); i++) { + for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); - mem->slots[index + i].alloc_size = - alloc_size - (i << IO_TLB_SHIFT); - } tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) From 70347877231eeb505a27abe80af7ae3d3a281519 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:39 +0800 Subject: [PATCH 008/474] swiotlb: Refactor swiotlb_tbl_unmap_single Add a new function, swiotlb_release_slots, to make the code reusable for supporting different bounce buffer pools. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0116a630dc13..23b6df3a6ab7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -555,27 +555,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } -/* - * tlb_addr is the physical address of the bounce buffer to unmap. - */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, enum dma_data_direction dir, - unsigned long attrs) +static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { - struct io_tlb_mem *mem = hwdev->dma_io_tlb_mem; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long flags; - unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; int nslots = nr_slots(mem->slots[index].alloc_size + offset); int count, i; - /* - * First, sync the memory before unmapping the entry - */ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); - /* * Return the buffer to the free list by setting the corresponding * entries to indicate the number of contiguous entries available. @@ -610,6 +598,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, spin_unlock_irqrestore(&mem->lock, flags); } +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) +{ + /* + * First, sync the memory before unmapping the entry + */ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); + + swiotlb_release_slots(dev, tlb_addr); +} + void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { From f4111e39a52aa5d5136d890bbd1aa87c1c8fe3bc Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:40 +0800 Subject: [PATCH 009/474] swiotlb: Add restricted DMA alloc/free support Add the functions, swiotlb_{alloc,free} and is_swiotlb_for_alloc to support the memory allocation from restricted DMA pool. The restricted DMA pool is preferred if available. Note that since coherent allocation needs remapping, one must set up another device coherent pool by shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic coherent allocation. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 26 ++++++++++++++++++++++ kernel/dma/direct.c | 49 +++++++++++++++++++++++++++++++---------- kernel/dma/swiotlb.c | 38 ++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 14 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index da348671b0d5..3b9454d1e498 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -85,6 +85,7 @@ extern enum swiotlb_force swiotlb_force; * @debugfs: The dentry to debugfs. * @late_alloc: %true if allocated using the page allocator * @force_bounce: %true if swiotlb bouncing is forced + * @for_alloc: %true if the pool is used for memory allocation */ struct io_tlb_mem { phys_addr_t start; @@ -96,6 +97,7 @@ struct io_tlb_mem { struct dentry *debugfs; bool late_alloc; bool force_bounce; + bool for_alloc; struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -158,4 +160,28 @@ static inline void swiotlb_adjust_size(unsigned long size) extern void swiotlb_print_info(void); extern void swiotlb_set_max_segment(unsigned int); +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size); +bool swiotlb_free(struct device *dev, struct page *page, size_t size); + +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return dev->dma_io_tlb_mem->for_alloc; +} +#else +static inline struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + return NULL; +} +static inline bool swiotlb_free(struct device *dev, struct page *page, + size_t size) +{ + return false; +} +static inline bool is_swiotlb_for_alloc(struct device *dev) +{ + return false; +} +#endif /* CONFIG_DMA_RESTRICTED_POOL */ + #endif /* __LINUX_SWIOTLB_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a92465b4eb12..2de33e5d302b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,6 +75,15 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static void __dma_direct_free_pages(struct device *dev, struct page *page, + size_t size) +{ + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + swiotlb_free(dev, page, size)) + return; + dma_free_contiguous(dev, page, size); +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -86,6 +95,16 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); + if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && + is_swiotlb_for_alloc(dev)) { + page = swiotlb_alloc(dev, size); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __dma_direct_free_pages(dev, page, size); + return NULL; + } + return page; + } + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); @@ -142,7 +161,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; @@ -155,18 +174,23 @@ void *dma_direct_alloc(struct device *dev, size_t size, } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); /* * Remapping or decrypting memory may block. If either is required and * we can't block, allocate the memory from the atomic pools. + * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must + * set up another device coherent pool by shared-dma-pool and use + * dma_alloc_from_dev_coherent instead. */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && !gfpflags_allow_blocking(gfp) && (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev))) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -237,7 +261,7 @@ out_encrypt_pages: return NULL; } out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -247,15 +271,15 @@ void dma_direct_free(struct device *dev, size_t size, unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) { + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ dma_free_contiguous(dev, cpu_addr, size); return; } if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) { + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) && + !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } @@ -273,7 +297,7 @@ void dma_direct_free(struct device *dev, size_t size, else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); + __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } struct page *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -283,7 +307,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, void *ret; if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && + !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -310,7 +335,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_free_pages: - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); return NULL; } @@ -329,7 +354,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - dma_free_contiguous(dev, page, size); + __dma_direct_free_pages(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 23b6df3a6ab7..44fc3d10f017 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -462,8 +462,9 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { - if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != - (orig_addr & iotlb_align_mask)) { + if (orig_addr && + (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { index = wrap_index(mem, index + 1); continue; } @@ -702,3 +703,36 @@ static int __init swiotlb_create_default_debugfs(void) late_initcall(swiotlb_create_default_debugfs); #endif + +#ifdef CONFIG_DMA_RESTRICTED_POOL +struct page *swiotlb_alloc(struct device *dev, size_t size) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + phys_addr_t tlb_addr; + int index; + + if (!mem) + return NULL; + + index = swiotlb_find_slots(dev, 0, size); + if (index == -1) + return NULL; + + tlb_addr = slot_addr(mem->start, index); + + return pfn_to_page(PFN_DOWN(tlb_addr)); +} + +bool swiotlb_free(struct device *dev, struct page *page, size_t size) +{ + phys_addr_t tlb_addr = page_to_phys(page); + + if (!is_swiotlb_buffer(dev, tlb_addr)) + return false; + + swiotlb_release_slots(dev, tlb_addr); + + return true; +} + +#endif /* CONFIG_DMA_RESTRICTED_POOL */ From 0b84e4f8b793eb4045fd64f6f514165a7974cd16 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:41 +0800 Subject: [PATCH 010/474] swiotlb: Add restricted DMA pool initialization Add the initialization function to create restricted DMA pools from matching reserved-memory nodes. Regardless of swiotlb setting, the restricted DMA pool is preferred if available. The restricted DMA pools provide a basic level of protection against the DMA overwriting buffer contents at unexpected times. However, to protect against general data leakage and system memory corruption, the system needs to provide a way to lock down the memory access, e.g., MPU. Signed-off-by: Claire Chang Reviewed-by: Christoph Hellwig Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 3 +- kernel/dma/Kconfig | 14 ++++++++ kernel/dma/swiotlb.c | 76 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 3b9454d1e498..39284ff2a6cd 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,7 +73,8 @@ extern enum swiotlb_force swiotlb_force; * range check to see if the memory was in fact allocated by this * API. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and - * @end. This is command line adjustable via setup_io_tlb_npages. + * @end. For default swiotlb, this is command line adjustable via + * setup_io_tlb_npages. * @used: The number of used IO TLB block. * @list: The free list describing the number of free entries available * from each index. diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 77b405508743..3e961dc39634 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -80,6 +80,20 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config DMA_RESTRICTED_POOL + bool "DMA Restricted Pool" + depends on OF && OF_RESERVED_MEM + select SWIOTLB + help + This enables support for restricted DMA pools which provide a level of + DMA memory protection on systems with limited hardware protection + capabilities, such as those lacking an IOMMU. + + For more information see + + and . + If unsure, say "n". + # # Should be selected if we can mmap non-coherent mappings to userspace. # The only thing that is really required is a way to set an uncached bit diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 44fc3d10f017..0ffbaae9fba2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -39,6 +39,13 @@ #ifdef CONFIG_DEBUG_FS #include #endif +#ifdef CONFIG_DMA_RESTRICTED_POOL +#include +#include +#include +#include +#include +#endif #include #include @@ -735,4 +742,73 @@ bool swiotlb_free(struct device *dev, struct page *page, size_t size) return true; } +static int rmem_swiotlb_device_init(struct reserved_mem *rmem, + struct device *dev) +{ + struct io_tlb_mem *mem = rmem->priv; + unsigned long nslabs = rmem->size >> IO_TLB_SHIFT; + + /* + * Since multiple devices can share the same pool, the private data, + * io_tlb_mem struct, will be initialized by the first device attached + * to it. + */ + if (!mem) { + mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), + rmem->size >> PAGE_SHIFT); + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); + mem->force_bounce = true; + mem->for_alloc = true; + + rmem->priv = mem; + + if (IS_ENABLED(CONFIG_DEBUG_FS)) { + mem->debugfs = + debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); + } + } + + dev->dma_io_tlb_mem = mem; + + return 0; +} + +static void rmem_swiotlb_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev->dma_io_tlb_mem = io_tlb_default_mem; +} + +static const struct reserved_mem_ops rmem_swiotlb_ops = { + .device_init = rmem_swiotlb_device_init, + .device_release = rmem_swiotlb_device_release, +}; + +static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "linux,cma-default", NULL) || + of_get_flat_dt_prop(node, "linux,dma-default", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if (PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) { + pr_err("Restricted DMA pool must be accessible within the linear mapping."); + return -EINVAL; + } + + rmem->ops = &rmem_swiotlb_ops; + pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); #endif /* CONFIG_DMA_RESTRICTED_POOL */ From b12fe999545cf3fd89e1a178ee4e541a9331da82 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:42 +0800 Subject: [PATCH 011/474] dt-bindings: of: Add restricted DMA pool Introduce the new compatible string, restricted-dma-pool, for restricted DMA. One can specify the address and length of the restricted DMA memory region by restricted-dma-pool in the reserved-memory node. Signed-off-by: Claire Chang Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- .../reserved-memory/reserved-memory.txt | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt index e8d3096d922c..39b5f4c5a511 100644 --- a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt +++ b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt @@ -51,6 +51,23 @@ compatible (optional) - standard definition used as a shared pool of DMA buffers for a set of devices. It can be used by an operating system to instantiate the necessary pool management subsystem if necessary. + - restricted-dma-pool: This indicates a region of memory meant to be + used as a pool of restricted DMA buffers for a set of devices. The + memory region would be the only region accessible to those devices. + When using this, the no-map and reusable properties must not be set, + so the operating system can create a virtual mapping that will be used + for synchronization. The main purpose for restricted DMA is to + mitigate the lack of DMA access control on systems without an IOMMU, + which could result in the DMA accessing the system memory at + unexpected times and/or unexpected addresses, possibly leading to data + leakage or corruption. The feature on its own provides a basic level + of protection against the DMA overwriting buffer contents at + unexpected times. However, to protect against general data leakage and + system memory corruption, the system needs to provide way to lock down + the memory access, e.g., MPU. Note that since coherent allocation + needs remapping, one must set up another device coherent pool by + shared-dma-pool and use dma_alloc_from_dev_coherent instead for atomic + coherent allocation. - vendor specific string in the form ,[-] no-map (optional) - empty property - Indicates the operating system must not create a virtual mapping @@ -85,10 +102,11 @@ memory-region-names (optional) - a list of names, one for each corresponding Example ------- -This example defines 3 contiguous regions are defined for Linux kernel: +This example defines 4 contiguous regions for Linux kernel: one default of all device drivers (named linux,cma@72000000 and 64MiB in size), -one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB), and -one for multimedia processing (named multimedia-memory@77000000, 64MiB). +one dedicated to the framebuffer device (named framebuffer@78000000, 8MiB), +one for multimedia processing (named multimedia-memory@77000000, 64MiB), and +one for restricted dma pool (named restricted_dma_reserved@0x50000000, 64MiB). / { #address-cells = <1>; @@ -120,6 +138,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB). compatible = "acme,multimedia-memory"; reg = <0x77000000 0x4000000>; }; + + restricted_dma_reserved: restricted_dma_reserved { + compatible = "restricted-dma-pool"; + reg = <0x50000000 0x4000000>; + }; }; /* ... */ @@ -138,4 +161,11 @@ one for multimedia processing (named multimedia-memory@77000000, 64MiB). memory-region = <&multimedia_reserved>; /* ... */ }; + + pcie_device: pcie_device@0,0 { + reg = <0x83010000 0x0 0x00000000 0x0 0x00100000 + 0x83010000 0x0 0x00100000 0x0 0x00100000>; + memory-region = <&restricted_dma_reserved>; + /* ... */ + }; }; From fec9b625095f7308e52a6922619cd4abaa9534a8 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Sat, 19 Jun 2021 11:40:43 +0800 Subject: [PATCH 012/474] of: Add plumbing for restricted DMA pool If a device is not behind an IOMMU, we look up the device node and set up the restricted DMA when the restricted-dma-pool is presented. Signed-off-by: Claire Chang Tested-by: Stefano Stabellini Tested-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/of/address.c | 33 +++++++++++++++++++++++++++++++++ drivers/of/device.c | 3 +++ drivers/of/of_private.h | 6 ++++++ 3 files changed, 42 insertions(+) diff --git a/drivers/of/address.c b/drivers/of/address.c index 94f017d808c4..973257434398 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -995,6 +996,38 @@ out: of_node_put(node); return ret; } + +int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) +{ + struct device_node *node, *of_node = dev->of_node; + int count, i; + + count = of_property_count_elems_of_size(of_node, "memory-region", + sizeof(u32)); + /* + * If dev->of_node doesn't exist or doesn't contain memory-region, try + * the OF node having DMA configuration. + */ + if (count <= 0) { + of_node = np; + count = of_property_count_elems_of_size( + of_node, "memory-region", sizeof(u32)); + } + + for (i = 0; i < count; i++) { + node = of_parse_phandle(of_node, "memory-region", i); + /* + * There might be multiple memory regions, but only one + * restricted-dma-pool region is allowed. + */ + if (of_device_is_compatible(node, "restricted-dma-pool") && + of_device_is_available(node)) + return of_reserved_mem_device_init_by_idx(dev, of_node, + i); + } + + return 0; +} #endif /* CONFIG_HAS_DMA */ /** diff --git a/drivers/of/device.c b/drivers/of/device.c index c5a9473a5fb1..2defdca418ec 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -165,6 +165,9 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); + if (!iommu) + return of_dma_set_restricted_buffer(dev, np); + return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 631489f7f8c0..376462798f7e 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -163,12 +163,18 @@ struct bus_dma_region; #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA) int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map); +int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np); #else static inline int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) { return -ENODEV; } +static inline int of_dma_set_restricted_buffer(struct device *dev, + struct device_node *np) +{ + return -ENODEV; +} #endif void fdt_init_reserved_mem(void); From 09a4a79d42ced8ca7fc250b05e45ac1cb23a9c52 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 1 Jul 2021 11:31:30 +0800 Subject: [PATCH 013/474] swiotlb: fix implicit debugfs declarations Factor out the debugfs bits from rmem_swiotlb_device_init() into a separate rmem_swiotlb_debugfs_init() to fix the implicit debugfs declarations. Fixes: 461021875c50 ("swiotlb: Add restricted DMA pool initialization") Reported-by: kernel test robot Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0ffbaae9fba2..b7f76bca89bf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -712,6 +712,21 @@ late_initcall(swiotlb_create_default_debugfs); #endif #ifdef CONFIG_DMA_RESTRICTED_POOL + +#ifdef CONFIG_DEBUG_FS +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ + struct io_tlb_mem *mem = rmem->priv; + + mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); + swiotlb_create_debugfs_files(mem); +} +#else +static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) +{ +} +#endif + struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; @@ -766,11 +781,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, rmem->priv = mem; - if (IS_ENABLED(CONFIG_DEBUG_FS)) { - mem->debugfs = - debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); - } + rmem_swiotlb_debugfs_init(rmem); } dev->dma_io_tlb_mem = mem; From 868c9ddc182bc6728bb380cbfb3170734f72c599 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Wed, 7 Jul 2021 14:12:54 +0900 Subject: [PATCH 014/474] swiotlb: add overflow checks to swiotlb_bounce This is a follow-up on 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") which fixed unaligned dma mappings, making sure the following overflows are caught: - offset of the start of the slot within the device bigger than requested address' offset, in other words if the base address given in swiotlb_tbl_map_single to create the mapping (orig_addr) was after the requested address for the sync (tlb_offset) in the same block: |------------------------------------------| block <----------------------------> mapped part of the block ^ orig_addr ^ invalid tlb_addr for sync - if the resulting offset was bigger than the allocation size this one could happen if the mapping was not until the end. e.g. |------------------------------------------| block <---------------------> mapped part of the block ^ ^ orig_addr invalid tlb_addr Both should never happen so print a warning and bail out without trying to adjust the sizes/offsets: the first one could try to sync from orig_addr to whatever is left of the requested size, but the later really has nothing to sync there... Signed-off-by: Dominique Martinet Cc: Konrad Rzeszutek Wilk Reviewed-by: Bumyong Lee Cc: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b7f76bca89bf..f1a9ae7fad8f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -365,13 +365,27 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); - unsigned int tlb_offset; + unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - - swiotlb_align_offset(dev, orig_addr); + tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); + orig_addr_offset = swiotlb_align_offset(dev, orig_addr); + if (tlb_offset < orig_addr_offset) { + dev_WARN_ONCE(dev, 1, + "Access before mapping start detected. orig offset %u, requested offset %u.\n", + orig_addr_offset, tlb_offset); + return; + } + + tlb_offset -= orig_addr_offset; + if (tlb_offset > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu+%u.\n", + alloc_size, size, tlb_offset); + return; + } orig_addr += tlb_offset; alloc_size -= tlb_offset; From 14d72af7ab00d9c2eb5b473b4fdf3b751b8cf8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?= Date: Fri, 16 Jul 2021 00:57:07 +0800 Subject: [PATCH 015/474] MIPS: Ingenic: Add system type for new Ingenic SoCs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add JZ4730, JZ4750, JZ4755, JZ4760, JZ4760B, X2000H, and X2100 system type for cat /proc/cpuinfo to give out JZ4730, JZ4750, JZ4755, JZ4760, JZ4760B, X2000H, and X2100. Signed-off-by: 周琰杰 (Zhou Yanjie) Reviewed-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/board-ingenic.c | 21 +++++++++++++++++++++ arch/mips/include/asm/bootinfo.h | 3 +++ arch/mips/include/asm/cpu.h | 4 ++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/mips/generic/board-ingenic.c b/arch/mips/generic/board-ingenic.c index 0cec0bea13d6..e86e0016c548 100644 --- a/arch/mips/generic/board-ingenic.c +++ b/arch/mips/generic/board-ingenic.c @@ -21,6 +21,10 @@ static __init char *ingenic_get_system_type(unsigned long machtype) { switch (machtype) { + case MACH_INGENIC_X2100: + return "X2100"; + case MACH_INGENIC_X2000H: + return "X2000H"; case MACH_INGENIC_X2000E: return "X2000E"; case MACH_INGENIC_X2000: @@ -37,8 +41,18 @@ static __init char *ingenic_get_system_type(unsigned long machtype) return "JZ4775"; case MACH_INGENIC_JZ4770: return "JZ4770"; + case MACH_INGENIC_JZ4760B: + return "JZ4760B"; + case MACH_INGENIC_JZ4760: + return "JZ4760"; + case MACH_INGENIC_JZ4755: + return "JZ4755"; + case MACH_INGENIC_JZ4750: + return "JZ4750"; case MACH_INGENIC_JZ4725B: return "JZ4725B"; + case MACH_INGENIC_JZ4730: + return "JZ4730"; default: return "JZ4740"; } @@ -61,8 +75,13 @@ static __init const void *ingenic_fixup_fdt(const void *fdt, const void *match_d } static const struct of_device_id ingenic_of_match[] __initconst = { + { .compatible = "ingenic,jz4730", .data = (void *)MACH_INGENIC_JZ4730 }, { .compatible = "ingenic,jz4740", .data = (void *)MACH_INGENIC_JZ4740 }, { .compatible = "ingenic,jz4725b", .data = (void *)MACH_INGENIC_JZ4725B }, + { .compatible = "ingenic,jz4750", .data = (void *)MACH_INGENIC_JZ4750 }, + { .compatible = "ingenic,jz4755", .data = (void *)MACH_INGENIC_JZ4755 }, + { .compatible = "ingenic,jz4760", .data = (void *)MACH_INGENIC_JZ4760 }, + { .compatible = "ingenic,jz4760b", .data = (void *)MACH_INGENIC_JZ4760B }, { .compatible = "ingenic,jz4770", .data = (void *)MACH_INGENIC_JZ4770 }, { .compatible = "ingenic,jz4775", .data = (void *)MACH_INGENIC_JZ4775 }, { .compatible = "ingenic,jz4780", .data = (void *)MACH_INGENIC_JZ4780 }, @@ -71,6 +90,8 @@ static const struct of_device_id ingenic_of_match[] __initconst = { { .compatible = "ingenic,x1830", .data = (void *)MACH_INGENIC_X1830 }, { .compatible = "ingenic,x2000", .data = (void *)MACH_INGENIC_X2000 }, { .compatible = "ingenic,x2000e", .data = (void *)MACH_INGENIC_X2000E }, + { .compatible = "ingenic,x2000h", .data = (void *)MACH_INGENIC_X2000H }, + { .compatible = "ingenic,x2100", .data = (void *)MACH_INGENIC_X2100 }, {} }; diff --git a/arch/mips/include/asm/bootinfo.h b/arch/mips/include/asm/bootinfo.h index 4c2e8173e6ec..2128ba903391 100644 --- a/arch/mips/include/asm/bootinfo.h +++ b/arch/mips/include/asm/bootinfo.h @@ -75,6 +75,7 @@ enum ingenic_machine_type { MACH_INGENIC_JZ4750, MACH_INGENIC_JZ4755, MACH_INGENIC_JZ4760, + MACH_INGENIC_JZ4760B, MACH_INGENIC_JZ4770, MACH_INGENIC_JZ4775, MACH_INGENIC_JZ4780, @@ -83,6 +84,8 @@ enum ingenic_machine_type { MACH_INGENIC_X1830, MACH_INGENIC_X2000, MACH_INGENIC_X2000E, + MACH_INGENIC_X2000H, + MACH_INGENIC_X2100, }; extern char *system_type; diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 9e6211e6d76b..d45a52f65b7a 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -46,8 +46,8 @@ #define PRID_COMP_NETLOGIC 0x0c0000 #define PRID_COMP_CAVIUM 0x0d0000 #define PRID_COMP_LOONGSON 0x140000 -#define PRID_COMP_INGENIC_13 0x130000 /* X2000 */ -#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4740, JZ4750, X1830 */ +#define PRID_COMP_INGENIC_13 0x130000 /* X2000, X2100 */ +#define PRID_COMP_INGENIC_D0 0xd00000 /* JZ4730, JZ4740, JZ4750, JZ4755, JZ4760, X1830 */ #define PRID_COMP_INGENIC_D1 0xd10000 /* JZ4770, JZ4775, X1000 */ #define PRID_COMP_INGENIC_E1 0xe10000 /* JZ4780 */ From e98b461bb057aaea6fa766260788c08825213837 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 13 Jul 2021 10:49:15 -0700 Subject: [PATCH 016/474] MIPS: octeon: Remove vestiges of CONFIG_CAVIUM_RESERVE32 The config option CAVIUM_RESERVE32 is not used. Remove the dead code controlled by it. Signed-off-by: Joe Perches Signed-off-by: Thomas Bogendoerfer --- .../cavium-octeon/executive/cvmx-cmd-queue.c | 21 ++-------- arch/mips/cavium-octeon/setup.c | 38 +------------------ 2 files changed, 5 insertions(+), 54 deletions(-) diff --git a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c index 3839feba68f2..fb42e8e21ea0 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c +++ b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c @@ -57,27 +57,14 @@ EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr); static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void) { char *alloc_name = "cvmx_cmd_queues"; -#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32 - extern uint64_t octeon_reserve32_memory; -#endif if (likely(__cvmx_cmd_queue_state_ptr)) return CVMX_CMD_QUEUE_SUCCESS; -#if defined(CONFIG_CAVIUM_RESERVE32) && CONFIG_CAVIUM_RESERVE32 - if (octeon_reserve32_memory) - __cvmx_cmd_queue_state_ptr = - cvmx_bootmem_alloc_named_range(sizeof(*__cvmx_cmd_queue_state_ptr), - octeon_reserve32_memory, - octeon_reserve32_memory + - (CONFIG_CAVIUM_RESERVE32 << - 20) - 1, 128, alloc_name); - else -#endif - __cvmx_cmd_queue_state_ptr = - cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr), - 128, - alloc_name); + __cvmx_cmd_queue_state_ptr = + cvmx_bootmem_alloc_named(sizeof(*__cvmx_cmd_queue_state_ptr), + 128, + alloc_name); if (__cvmx_cmd_queue_state_ptr) memset(__cvmx_cmd_queue_state_ptr, 0, sizeof(*__cvmx_cmd_queue_state_ptr)); diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index ce4e2806159b..0ddd3cc16ee4 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -284,11 +284,6 @@ void octeon_crash_smp_send_stop(void) #endif /* CONFIG_KEXEC */ -#ifdef CONFIG_CAVIUM_RESERVE32 -uint64_t octeon_reserve32_memory; -EXPORT_SYMBOL(octeon_reserve32_memory); -#endif - #ifdef CONFIG_KEXEC /* crashkernel cmdline parameter is parsed _after_ memory setup * we also parse it here (workaround for EHB5200) */ @@ -665,9 +660,7 @@ void __init prom_init(void) int i; u64 t; int argc; -#ifdef CONFIG_CAVIUM_RESERVE32 - int64_t addr = -1; -#endif + /* * The bootloader passes a pointer to the boot descriptor in * $a3, this is available as fw_arg3. @@ -782,25 +775,6 @@ void __init prom_init(void) cvmx_write_csr(CVMX_LED_UDD_DATX(1), 0); cvmx_write_csr(CVMX_LED_EN, 1); } -#ifdef CONFIG_CAVIUM_RESERVE32 - /* - * We need to temporarily allocate all memory in the reserve32 - * region. This makes sure the kernel doesn't allocate this - * memory when it is getting memory from the - * bootloader. Later, after the memory allocations are - * complete, the reserve32 will be freed. - * - * Allocate memory for RESERVED32 aligned on 2MB boundary. This - * is in case we later use hugetlb entries with it. - */ - addr = cvmx_bootmem_phy_named_block_alloc(CONFIG_CAVIUM_RESERVE32 << 20, - 0, 0, 2 << 20, - "CAVIUM_RESERVE32", 0); - if (addr < 0) - pr_err("Failed to allocate CAVIUM_RESERVE32 memory area\n"); - else - octeon_reserve32_memory = addr; -#endif #ifdef CONFIG_CAVIUM_OCTEON_LOCK_L2 if (cvmx_read_csr(CVMX_L2D_FUS3) & (3ull << 34)) { @@ -1078,16 +1052,6 @@ void __init plat_mem_setup(void) cvmx_bootmem_unlock(); #endif /* CONFIG_CRASH_DUMP */ -#ifdef CONFIG_CAVIUM_RESERVE32 - /* - * Now that we've allocated the kernel memory it is safe to - * free the reserved region. We free it here so that builtin - * drivers can use the memory. - */ - if (octeon_reserve32_memory) - cvmx_bootmem_free_named("CAVIUM_RESERVE32"); -#endif /* CONFIG_CAVIUM_RESERVE32 */ - if (total == 0) panic("Unable to allocate memory from " "cvmx_bootmem_phy_alloc"); From faff43da31ae469ff28d71af4aa47f0c08b0b26c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 19 Jul 2021 21:16:17 -0700 Subject: [PATCH 017/474] mips: cavium-octeon: clean up kernel-doc in cvmx-interrupt-decodes.c Fix kernel-doc warnings reported by kernel test robot: arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c:49: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * __cvmx_interrupt_gmxx_rxx_int_en_enable enables all interrupt bits in cvmx_gmxx_rxx_int_en_t arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c:230: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * __cvmx_interrupt_pcsx_intx_en_reg_enable enables all interrupt bits in cvmx_pcsx_intx_en_reg_t arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c:271: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * __cvmx_interrupt_pcsxx_int_en_reg_enable enables all interrupt bits in cvmx_pcsxx_int_en_reg_t arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c:301: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * __cvmx_interrupt_spxx_int_msk_enable enables all interrupt bits in cvmx_spxx_int_msk_t arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c:340: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * __cvmx_interrupt_stxx_int_msk_enable enables all interrupt bits in cvmx_stxx_int_msk_t Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Aditya Srivastava Cc: David Daney Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- .../executive/cvmx-interrupt-decodes.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c index 2f415d9d0f3c..67d6da21d49f 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c +++ b/arch/mips/cavium-octeon/executive/cvmx-interrupt-decodes.c @@ -46,7 +46,9 @@ /** - * __cvmx_interrupt_gmxx_rxx_int_en_enable enables all interrupt bits in cvmx_gmxx_rxx_int_en_t + * __cvmx_interrupt_gmxx_rxx_int_en_enable - enable all interrupt bits in cvmx_gmxx_rxx_int_en_t + * @index: interrupt register offset + * @block: interrupt register block_id */ void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block) { @@ -227,7 +229,9 @@ void __cvmx_interrupt_gmxx_rxx_int_en_enable(int index, int block) cvmx_write_csr(CVMX_GMXX_RXX_INT_EN(index, block), gmx_rx_int_en.u64); } /** - * __cvmx_interrupt_pcsx_intx_en_reg_enable enables all interrupt bits in cvmx_pcsx_intx_en_reg_t + * __cvmx_interrupt_pcsx_intx_en_reg_enable - enable all interrupt bits in cvmx_pcsx_intx_en_reg_t + * @index: interrupt register offset + * @block: interrupt register block_id */ void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block) { @@ -268,7 +272,8 @@ void __cvmx_interrupt_pcsx_intx_en_reg_enable(int index, int block) cvmx_write_csr(CVMX_PCSX_INTX_EN_REG(index, block), pcs_int_en_reg.u64); } /** - * __cvmx_interrupt_pcsxx_int_en_reg_enable enables all interrupt bits in cvmx_pcsxx_int_en_reg_t + * __cvmx_interrupt_pcsxx_int_en_reg_enable - enable all interrupt bits in cvmx_pcsxx_int_en_reg_t + * @index: interrupt register block_id */ void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index) { @@ -298,7 +303,8 @@ void __cvmx_interrupt_pcsxx_int_en_reg_enable(int index) } /** - * __cvmx_interrupt_spxx_int_msk_enable enables all interrupt bits in cvmx_spxx_int_msk_t + * __cvmx_interrupt_spxx_int_msk_enable - enable all interrupt bits in cvmx_spxx_int_msk_t + * @index: interrupt register block_id */ void __cvmx_interrupt_spxx_int_msk_enable(int index) { @@ -337,7 +343,8 @@ void __cvmx_interrupt_spxx_int_msk_enable(int index) cvmx_write_csr(CVMX_SPXX_INT_MSK(index), spx_int_msk.u64); } /** - * __cvmx_interrupt_stxx_int_msk_enable enables all interrupt bits in cvmx_stxx_int_msk_t + * __cvmx_interrupt_stxx_int_msk_enable - enable all interrupt bits in cvmx_stxx_int_msk_t + * @index: interrupt register block_id */ void __cvmx_interrupt_stxx_int_msk_enable(int index) { From 73b9919f3c17851b4c1d90fb6c343d9c438c554c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 19 Jul 2021 21:34:32 -0700 Subject: [PATCH 018/474] mips: netlogic: fix kernel-doc complaints in fmn-config.c Clean up kernel-doc warnings in netlogic/xlr/fmn-config.c by using correct kernel-doc format. Fixes these warnings: arch/mips/netlogic/xlr/fmn-config.c:106: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Configure bucket size and credits for a device. 'size' is the size of arch/mips/netlogic/xlr/fmn-config.c:181: warning: expecting prototype for Setup the FMN details for each devices according to the device available(). Prototype was for xlr_board_info_setup() instead Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Ganesan Ramalingam Cc: John Crispin Cc: Aditya Srivastava Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/netlogic/xlr/fmn-config.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/mips/netlogic/xlr/fmn-config.c b/arch/mips/netlogic/xlr/fmn-config.c index c7622c6e5f67..15483537e8cf 100644 --- a/arch/mips/netlogic/xlr/fmn-config.c +++ b/arch/mips/netlogic/xlr/fmn-config.c @@ -103,18 +103,19 @@ static void check_credit_distribution(void) } /** - * Configure bucket size and credits for a device. 'size' is the size of - * the buckets for the device. This size is distributed among all the CPUs - * so that all of them can send messages to the device. - * - * The device is also given 'cpu_credits' to send messages to the CPUs - * + * setup_fmn_cc - Configure bucket size and credits for a device. * @dev_info: FMN information structure for each devices * @start_stn_id: Starting station id of dev_info * @end_stn_id: End station id of dev_info * @num_buckets: Total number of buckets for den_info * @cpu_credits: Allowed credits to cpu for each devices pointing by dev_info * @size: Size of the each buckets in the device station + * + * 'size' is the size of the buckets for the device. This size is + * distributed among all the CPUs + * so that all of them can send messages to the device. + * + * The device is also given 'cpu_credits' to send messages to the CPUs */ static void setup_fmn_cc(struct xlr_fmn_info *dev_info, int start_stn_id, int end_stn_id, int num_buckets, int cpu_credits, int size) @@ -174,6 +175,8 @@ static void setup_cpu_fmninfo(struct xlr_fmn_info *cpu, int num_core) } /** + * xlr_board_info_setup - Setup FMN details + * * Setup the FMN details for each devices according to the device available * in each variant of XLR/XLS processor */ From d17eef2767d84b7dc4e1b8029be1eacb1d8679f6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 1 Apr 2021 00:06:56 +0900 Subject: [PATCH 019/474] mips: replace deprecated EXTRA_CFLAGS with ccflags-y As Documentation/kbuild/makefiles.rst says, EXTRA_CFLAGS is deprecated. Replace it with ccflags-y. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Bogendoerfer --- arch/mips/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index c67250a956b8..18c69eff1d3f 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -4,7 +4,7 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) -EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm +ccflags-y += -Ivirt/kvm -Iarch/mips/kvm common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o From d656132d2a2abc06917d822f7adcda86fd6dd192 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 1 Apr 2021 00:06:57 +0900 Subject: [PATCH 020/474] mips: clean up kvm Makefile You can use kvm-y instead of kvm-objs to create the composite module. kvm-$(CONFIG_...) looks cleaner. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Bogendoerfer --- arch/mips/kvm/Makefile | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 18c69eff1d3f..d3710959da55 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -2,21 +2,18 @@ # Makefile for KVM support for MIPS # -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) - ccflags-y += -Ivirt/kvm -Iarch/mips/kvm -common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o +kvm-y := $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) +kvm-$(CONFIG_CPU_HAS_MSA) += msa.o -kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ +kvm-y += mips.o emulate.o entry.o \ interrupt.o stats.o \ fpu.o -kvm-objs += hypcall.o -kvm-objs += mmu.o -ifdef CONFIG_CPU_LOONGSON64 -kvm-objs += loongson_ipi.o -endif +kvm-y += hypcall.o +kvm-y += mmu.o +kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o -kvm-objs += vz.o +kvm-y += vz.o obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o From 85044eb08d0a37b1b6bcb3504bfd660a85ba5b7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:23 +0100 Subject: [PATCH 021/474] of: Return success from of_dma_set_restricted_buffer() when !OF_ADDRESS When CONFIG_OF_ADDRESS=n, of_dma_set_restricted_buffer() returns -ENODEV and breaks the boot for sparc[64] machines. Return 0 instead, since the function is essentially a glorified NOP in this configuration. Cc: Claire Chang Cc: Konrad Rzeszutek Wilk Reported-by: Guenter Roeck Suggested-by: Robin Murphy Tested-by: Guenter Roeck Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210702030807.GA2685166@roeck-us.net Fixes: fec9b625095f ("of: Add plumbing for restricted DMA pool") Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/of/of_private.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 376462798f7e..f557bd22b0cf 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -173,7 +173,8 @@ static inline int of_dma_get_range(struct device_node *np, static inline int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { - return -ENODEV; + /* Do nothing, successfully. */ + return 0; } #endif From 463e862ac63ef27fca423782536f6465abc3f180 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:24 +0100 Subject: [PATCH 022/474] swiotlb: Convert io_default_tlb_mem to static allocation Since commit 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used"), 'struct device' may hold a copy of the global 'io_default_tlb_mem' pointer if the device is using swiotlb for DMA. A subsequent call to swiotlb_exit() will therefore leave dangling pointers behind in these device structures, resulting in KASAN splats such as: | BUG: KASAN: use-after-free in __iommu_dma_unmap_swiotlb+0x64/0xb0 | Read of size 8 at addr ffff8881d7830000 by task swapper/0/0 | | CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.12.0-rc3-debug #1 | Hardware name: HP HP Desktop M01-F1xxx/87D6, BIOS F.12 12/17/2020 | Call Trace: | | dump_stack+0x9c/0xcf | print_address_description.constprop.0+0x18/0x130 | kasan_report.cold+0x7f/0x111 | __iommu_dma_unmap_swiotlb+0x64/0xb0 | nvme_pci_complete_rq+0x73/0x130 | blk_complete_reqs+0x6f/0x80 | __do_softirq+0xfc/0x3be Convert 'io_default_tlb_mem' to a static structure, so that the per-device pointers remain valid after swiotlb_exit() has been invoked. All users are updated to reference the static structure directly, using the 'nslabs' field to determine whether swiotlb has been initialised. The 'slots' array is still allocated dynamically and referenced via a pointer rather than a flexible array member. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Fixes: 69031f500865 ("swiotlb: Set dev->dma_io_tlb_mem to the swiotlb pool used") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/base/core.c | 2 +- drivers/xen/swiotlb-xen.c | 4 +-- include/linux/swiotlb.h | 4 +-- kernel/dma/swiotlb.c | 66 +++++++++++++++++++++------------------ 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index ea5b85354526..b49824001cfa 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2848,7 +2848,7 @@ void device_initialize(struct device *dev) dev->dma_coherent = dma_default_coherent; #endif #ifdef CONFIG_SWIOTLB - dev->dma_io_tlb_mem = io_tlb_default_mem; + dev->dma_io_tlb_mem = &io_tlb_default_mem; #endif } EXPORT_SYMBOL_GPL(device_initialize); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 785ec7e8be01..f06d9b4f1e0f 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -164,7 +164,7 @@ int __ref xen_swiotlb_init(void) int rc = -ENOMEM; char *start; - if (io_tlb_default_mem != NULL) { + if (io_tlb_default_mem.nslabs) { pr_warn("swiotlb buffer already initialized\n"); return -EEXIST; } @@ -547,7 +547,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask; + return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 39284ff2a6cd..b0cb2a9973f4 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -103,9 +103,9 @@ struct io_tlb_mem { phys_addr_t orig_addr; size_t alloc_size; unsigned int list; - } slots[]; + } *slots; }; -extern struct io_tlb_mem *io_tlb_default_mem; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr) { diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1a9ae7fad8f..7948f274f9bb 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -70,7 +70,7 @@ enum swiotlb_force swiotlb_force; -struct io_tlb_mem *io_tlb_default_mem; +struct io_tlb_mem io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -101,7 +101,7 @@ early_param("swiotlb", setup_io_tlb_npages); unsigned int swiotlb_max_segment(void) { - return io_tlb_default_mem ? max_segment : 0; + return io_tlb_default_mem.nslabs ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -134,9 +134,9 @@ void __init swiotlb_adjust_size(unsigned long size) void swiotlb_print_info(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) { + if (!mem->nslabs) { pr_warn("No low mem\n"); return; } @@ -163,11 +163,11 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (!mem || mem->late_alloc) + if (!mem->nslabs || mem->late_alloc) return; vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); @@ -201,25 +201,24 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; size_t alloc_size; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); - mem = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem) + alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); + mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->slots) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false); - io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); @@ -304,26 +303,24 @@ swiotlb_late_init_with_default_size(size_t default_size) int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - struct io_tlb_mem *mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long bytes = nslabs << IO_TLB_SHIFT; if (swiotlb_force == SWIOTLB_NO_FORCE) return 0; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_default_mem)) + if (WARN_ON_ONCE(mem->nslabs)) return -ENOMEM; - mem = (void *)__get_free_pages(GFP_KERNEL, - get_order(struct_size(mem, slots, nslabs))); - if (!mem) + mem->slots = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(array_size(sizeof(*mem->slots), nslabs))); + if (!mem->slots) return -ENOMEM; - memset(mem, 0, sizeof(*mem)); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true); - io_tlb_default_mem = mem; swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; @@ -331,18 +328,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; size_t size; + struct io_tlb_mem *mem = &io_tlb_default_mem; - if (!mem) + if (!mem->nslabs) return; - size = struct_size(mem, slots, mem->nslabs); + size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) - free_pages((unsigned long)mem, get_order(size)); + free_pages((unsigned long)mem->slots, get_order(size)); else - memblock_free_late(__pa(mem), PAGE_ALIGN(size)); - io_tlb_default_mem = NULL; + memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + memset(mem, 0, sizeof(*mem)); } /* @@ -696,7 +693,9 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(struct device *dev) { - return dev->dma_io_tlb_mem != NULL; + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + + return mem && mem->nslabs; } EXPORT_SYMBOL_GPL(is_swiotlb_active); @@ -711,10 +710,10 @@ static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) static int __init swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = io_tlb_default_mem; + struct io_tlb_mem *mem = &io_tlb_default_mem; debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem) { + if (mem->nslabs) { mem->debugfs = debugfs_dir; swiotlb_create_debugfs_files(mem); } @@ -783,10 +782,17 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, * to it. */ if (!mem) { - mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL); + mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; + mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), + GFP_KERNEL); + if (!mem->slots) { + kfree(mem); + return -ENOMEM; + } + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false); @@ -806,7 +812,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, static void rmem_swiotlb_device_release(struct reserved_mem *rmem, struct device *dev) { - dev->dma_io_tlb_mem = io_tlb_default_mem; + dev->dma_io_tlb_mem = &io_tlb_default_mem; } static const struct reserved_mem_ops rmem_swiotlb_ops = { From 1efd3fc0ccf52e1aa5f0bf5b0d82847180d20951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:25 +0100 Subject: [PATCH 023/474] swiotlb: Emit diagnostic in swiotlb_exit() A recent debugging session would have been made a little bit easier if we had noticed sooner that swiotlb_exit() was being called during boot. Add a simple diagnostic message to swiotlb_exit() to complement the one from swiotlb_print_info() during initialisation. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Link: https://lore.kernel.org/r/20210705190352.GA19461@willie-the-truck Suggested-by: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7948f274f9bb..b3c793ed9e64 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -334,6 +334,7 @@ void __init swiotlb_exit(void) if (!mem->nslabs) return; + pr_info("tearing down default memory pool\n"); size = array_size(sizeof(*mem->slots), mem->nslabs); if (mem->late_alloc) free_pages((unsigned long)mem->slots, get_order(size)); From ad6c00283163cb7ad52cdf97d2850547446f7d98 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 20 Jul 2021 14:38:26 +0100 Subject: [PATCH 024/474] swiotlb: Free tbl memory in swiotlb_exit() Although swiotlb_exit() frees the 'slots' metadata array referenced by 'io_tlb_default_mem', it leaves the underlying buffer pages allocated despite no longer being usable. Extend swiotlb_exit() to free the buffer pages as well as the slots array. Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Cc: Konrad Rzeszutek Wilk Tested-by: Nathan Chancellor Tested-by: Claire Chang Reviewed-by: Christoph Hellwig Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b3c793ed9e64..87c40517e822 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -328,18 +328,27 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) void __init swiotlb_exit(void) { - size_t size; struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long tbl_vaddr; + size_t tbl_size, slots_size; if (!mem->nslabs) return; pr_info("tearing down default memory pool\n"); - size = array_size(sizeof(*mem->slots), mem->nslabs); - if (mem->late_alloc) - free_pages((unsigned long)mem->slots, get_order(size)); - else - memblock_free_late(__pa(mem->slots), PAGE_ALIGN(size)); + tbl_vaddr = (unsigned long)phys_to_virt(mem->start); + tbl_size = PAGE_ALIGN(mem->end - mem->start); + slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs)); + + set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT); + if (mem->late_alloc) { + free_pages(tbl_vaddr, get_order(tbl_size)); + free_pages((unsigned long)mem->slots, get_order(slots_size)); + } else { + memblock_free_late(mem->start, tbl_size); + memblock_free_late(__pa(mem->slots), slots_size); + } + memset(mem, 0, sizeof(*mem)); } From 93ebb6828723b8aef114415c4dc3518342f7dcad Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 24 Jul 2021 01:17:46 +0200 Subject: [PATCH 025/474] s390/pv: fix the forcing of the swiotlb Since commit 903cd0f315fe ("swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing") if code sets swiotlb_force it needs to do so before the swiotlb is initialised. Otherwise io_tlb_default_mem->force_bounce will not get set to true, and devices that use (the default) swiotlb will not bounce despite switolb_force having the value of SWIOTLB_FORCE. Let us restore swiotlb functionality for PV by fulfilling this new requirement. This change addresses what turned out to be a fragility in commit 64e1f0c531d1 ("s390/mm: force swiotlb for protected virtualization"), which ain't exactly broken in its original context, but could give us some more headache if people backport the broken change and forget this fix. Signed-off-by: Halil Pasic Tested-by: Christian Borntraeger Reviewed-by: Christian Borntraeger Fixes: 903cd0f315fe ("swiotlb: Use is_swiotlb_force_bounce for swiotlb data bouncing") Fixes: 64e1f0c531d1 ("s390/mm: force swiotlb for protected virtualization") Cc: stable@vger.kernel.org #5.3+ Signed-off-by: Konrad Rzeszutek Wilk --- arch/s390/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8ac710de1ab1..07bbee9b7320 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -186,9 +186,9 @@ static void pv_init(void) return; /* make sure bounce buffers are shared */ + swiotlb_force = SWIOTLB_FORCE; swiotlb_init(1); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) From 374c15594c4ee0dfcceb38852bd43be09070f402 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:42 -0700 Subject: [PATCH 026/474] iommu/io-pgtable: Introduce unmap_pages() as a page table op The io-pgtable code expects to operate on a single block or granule of memory that is supported by the IOMMU hardware when unmapping memory. This means that when a large buffer that consists of multiple such blocks is unmapped, the io-pgtable code will walk the page tables to the correct level to unmap each block, even for blocks that are virtually contiguous and at the same level, which can incur an overhead in performance. Introduce the unmap_pages() page table op to express to the io-pgtable code that it should unmap a number of blocks of the same size, instead of a single block. Doing so allows multiple blocks to be unmapped in one call to the io-pgtable code, reducing the number of page table walks, and indirect calls. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-2-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 4d40dfa75b55..9391c5fa71e6 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -144,6 +144,7 @@ struct io_pgtable_cfg { * * @map: Map a physically contiguous memory region. * @unmap: Unmap a physically contiguous memory region. + * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. * * These functions map directly onto the iommu_ops member functions with @@ -154,6 +155,9 @@ struct io_pgtable_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); + size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); }; From cacffb7f7b45ba7649eedea4c196c6e9f1863bf3 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:43 -0700 Subject: [PATCH 027/474] iommu: Add an unmap_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to unmap a virtually contiguous range of pages of the same size. For IOMMU drivers that do not specify an unmap_pages() callback, the existing logic of unmapping memory one page block at a time will be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-3-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..25a844121be5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -181,6 +181,7 @@ struct iommu_iotlb_gather { * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain + * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush @@ -231,6 +232,9 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot, gfp_t gfp); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); + size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, size_t size); From ca073b55d16a83ba7e73cd313312abc68f07f293 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:44 -0700 Subject: [PATCH 028/474] iommu/io-pgtable: Introduce map_pages() as a page table op Mapping memory into io-pgtables follows the same semantics that unmapping memory used to follow (i.e. a buffer will be mapped one page block per call to the io-pgtable code). This means that it can be optimized in the same way that unmapping memory was, so add a map_pages() callback to the io-pgtable ops structure, so that a range of pages of the same size can be mapped within the same call. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-4-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 9391c5fa71e6..c43f3b899d2a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -143,6 +143,7 @@ struct io_pgtable_cfg { * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map: Map a physically contiguous memory region. + * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap: Unmap a physically contiguous memory region. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. @@ -153,6 +154,9 @@ struct io_pgtable_cfg { struct io_pgtable_ops { int (*map)(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather); size_t (*unmap_pages)(struct io_pgtable_ops *ops, unsigned long iova, From 910c4406ccc9613de0a54abf910edc4bf8a575c0 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:45 -0700 Subject: [PATCH 029/474] iommu: Add a map_pages() op for IOMMU drivers Add a callback for IOMMU drivers to provide a path for the IOMMU framework to call into an IOMMU driver, which can call into the io-pgtable code, to map a physically contiguous rnage of pages of the same size. For IOMMU drivers that do not specify a map_pages() callback, the existing logic of mapping memory one page block at a time will be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Acked-by: Lu Baolu Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-5-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 25a844121be5..d7989d4a7404 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -180,6 +180,8 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain + * @map_pages: map a physically contiguous set of pages of the same size to + * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain * @unmap_pages: unmap a number of pages of the same size from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain @@ -230,6 +232,9 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_pages)(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); size_t (*unmap_pages)(struct iommu_domain *domain, unsigned long iova, From e7d6fff6b3d34230bac1b2c2607646d7b8638cb7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:46 -0700 Subject: [PATCH 030/474] iommu: Use bitmap to calculate page size in iommu_pgsize() Avoid the potential for shifting values by amounts greater than the width of their type by using a bitmap to compute page size in iommu_pgsize(). Signed-off-by: Will Deacon Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-6-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5419c4b9f27a..80e471ada358 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -2378,30 +2379,22 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long addr_merge, size_t size) { unsigned int pgsize_idx; + unsigned long pgsizes; size_t pgsize; - /* Max page size that still fits into 'size' */ - pgsize_idx = __fls(size); + /* Page sizes supported by the hardware and small enough for @size */ + pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); - /* need to consider alignment requirements ? */ - if (likely(addr_merge)) { - /* Max page size allowed by address */ - unsigned int align_pgsize_idx = __ffs(addr_merge); - pgsize_idx = min(pgsize_idx, align_pgsize_idx); - } + /* Constrain the page sizes further based on the maximum alignment */ + if (likely(addr_merge)) + pgsizes &= GENMASK(__ffs(addr_merge), 0); - /* build a mask of acceptable page sizes */ - pgsize = (1UL << (pgsize_idx + 1)) - 1; + /* Make sure we have at least one suitable page size */ + BUG_ON(!pgsizes); - /* throw away page sizes not supported by the hardware */ - pgsize &= domain->pgsize_bitmap; - - /* make sure we're still sane */ - BUG_ON(!pgsize); - - /* pick the biggest page */ - pgsize_idx = __fls(pgsize); - pgsize = 1UL << pgsize_idx; + /* Pick the biggest page size remaining */ + pgsize_idx = __fls(pgsizes); + pgsize = BIT(pgsize_idx); return pgsize; } From 89d5b9601f70b72f524fdb7bc6cf1b1e8e20e867 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:47 -0700 Subject: [PATCH 031/474] iommu: Split 'addr_merge' argument to iommu_pgsize() into separate parts The 'addr_merge' parameter to iommu_pgsize() is a fabricated address intended to describe the alignment requirements to consider when choosing an appropriate page size. On the iommu_map() path, this address is the logical OR of the virtual and physical addresses. Subsequent improvements to iommu_pgsize() will need to check the alignment of the virtual and physical components of 'addr_merge' independently, so pass them in as separate parameters and reconstruct 'addr_merge' locally. No functional change. Signed-off-by: Will Deacon Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-7-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 80e471ada358..80e14c139d40 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2375,12 +2375,13 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); -static size_t iommu_pgsize(struct iommu_domain *domain, - unsigned long addr_merge, size_t size) +static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size) { unsigned int pgsize_idx; unsigned long pgsizes; size_t pgsize; + unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ pgsizes = domain->pgsize_bitmap & GENMASK(__fls(size), 0); @@ -2433,7 +2434,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova | paddr, size); + size_t pgsize = iommu_pgsize(domain, iova, paddr, size); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); @@ -2521,8 +2522,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. */ while (unmapped < size) { - size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); + size_t pgsize; + pgsize = iommu_pgsize(domain, iova, iova, size - unmapped); unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); if (!unmapped_page) break; From b1d99dc5f983b4ae8ab9841981c5dbd5530f6344 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 16 Jun 2021 06:38:48 -0700 Subject: [PATCH 032/474] iommu: Hook up '->unmap_pages' driver callback Extend iommu_pgsize() to populate an optional 'count' parameter so that we can direct unmapping operation to the ->unmap_pages callback if it has been provided by the driver. Signed-off-by: Will Deacon Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-8-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 59 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 80e14c139d40..725622c7e603 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2376,11 +2376,11 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) EXPORT_SYMBOL_GPL(iommu_iova_to_phys); static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size) + phys_addr_t paddr, size_t size, size_t *count) { - unsigned int pgsize_idx; + unsigned int pgsize_idx, pgsize_idx_next; unsigned long pgsizes; - size_t pgsize; + size_t offset, pgsize, pgsize_next; unsigned long addr_merge = paddr | iova; /* Page sizes supported by the hardware and small enough for @size */ @@ -2396,7 +2396,36 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, /* Pick the biggest page size remaining */ pgsize_idx = __fls(pgsizes); pgsize = BIT(pgsize_idx); + if (!count) + return pgsize; + /* Find the next biggest support page size, if it exists */ + pgsizes = domain->pgsize_bitmap & ~GENMASK(pgsize_idx, 0); + if (!pgsizes) + goto out_set_count; + + pgsize_idx_next = __ffs(pgsizes); + pgsize_next = BIT(pgsize_idx_next); + + /* + * There's no point trying a bigger page size unless the virtual + * and physical addresses are similarly offset within the larger page. + */ + if ((iova ^ paddr) & (pgsize_next - 1)) + goto out_set_count; + + /* Calculate the offset to the next page size alignment boundary */ + offset = pgsize_next - (addr_merge & (pgsize_next - 1)); + + /* + * If size is big enough to accommodate the larger page, reduce + * the number of smaller pages. + */ + if (offset + pgsize_next <= size) + size = offset; + +out_set_count: + *count = size >> pgsize_idx; return pgsize; } @@ -2434,7 +2463,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova, paddr, size); + size_t pgsize = iommu_pgsize(domain, iova, paddr, size, NULL); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", iova, &paddr, pgsize); @@ -2485,6 +2514,19 @@ int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map_atomic); +static size_t __iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t size, + struct iommu_iotlb_gather *iotlb_gather) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize, count; + + pgsize = iommu_pgsize(domain, iova, iova, size, &count); + return ops->unmap_pages ? + ops->unmap_pages(domain, iova, pgsize, count, iotlb_gather) : + ops->unmap(domain, iova, pgsize, iotlb_gather); +} + static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather) @@ -2494,7 +2536,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long orig_iova = iova; unsigned int min_pagesz; - if (unlikely(ops->unmap == NULL || + if (unlikely(!(ops->unmap || ops->unmap_pages) || domain->pgsize_bitmap == 0UL)) return 0; @@ -2522,10 +2564,9 @@ static size_t __iommu_unmap(struct iommu_domain *domain, * or we hit an area that isn't mapped. */ while (unmapped < size) { - size_t pgsize; - - pgsize = iommu_pgsize(domain, iova, iova, size - unmapped); - unmapped_page = ops->unmap(domain, iova, pgsize, iotlb_gather); + unmapped_page = __iommu_unmap_pages(domain, iova, + size - unmapped, + iotlb_gather); if (!unmapped_page) break; From 647c57764b37c0ddfa6bf775b03d5e5cc407086b Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:49 -0700 Subject: [PATCH 033/474] iommu: Add support for the map_pages() callback Since iommu_pgsize can calculate how many pages of the same size can be mapped/unmapped before the next largest page size boundary, add support for invoking an IOMMU driver's map_pages() callback, if it provides one. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1623850736-389584-9-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 725622c7e603..70a729ce88b1 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2429,6 +2429,30 @@ out_set_count: return pgsize; } +static int __iommu_map_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, + gfp_t gfp, size_t *mapped) +{ + const struct iommu_ops *ops = domain->ops; + size_t pgsize, count; + int ret; + + pgsize = iommu_pgsize(domain, iova, paddr, size, &count); + + pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", + iova, &paddr, pgsize, count); + + if (ops->map_pages) { + ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, + gfp, mapped); + } else { + ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); + *mapped = ret ? 0 : pgsize; + } + + return ret; +} + static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { @@ -2439,7 +2463,7 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t orig_paddr = paddr; int ret = 0; - if (unlikely(ops->map == NULL || + if (unlikely(!(ops->map || ops->map_pages) || domain->pgsize_bitmap == 0UL)) return -ENODEV; @@ -2463,18 +2487,21 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize = iommu_pgsize(domain, iova, paddr, size, NULL); + size_t mapped = 0; - pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx\n", - iova, &paddr, pgsize); - ret = ops->map(domain, iova, paddr, pgsize, prot, gfp); + ret = __iommu_map_pages(domain, iova, paddr, size, prot, gfp, + &mapped); + /* + * Some pages may have been mapped, even if an error occurred, + * so we should account for those so they can be unmapped. + */ + size -= mapped; if (ret) break; - iova += pgsize; - paddr += pgsize; - size -= pgsize; + iova += mapped; + paddr += mapped; } /* unroll mapping in case something went wrong */ From 41e1eb2546e9c8200d32f11e4b47d86d156a5a97 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:50 -0700 Subject: [PATCH 034/474] iommu/io-pgtable-arm: Prepare PTE methods for handling multiple entries The PTE methods currently operate on a single entry. In preparation for manipulating multiple PTEs in one map or unmap call, allow them to handle multiple PTEs. Signed-off-by: Isaac J. Manjarres Suggested-by: Robin Murphy Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-10-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm.c | 74 +++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..ea66b10c04c4 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -232,20 +232,23 @@ static void __arm_lpae_free_pages(void *pages, size_t size, free_pages((unsigned long)pages, get_order(size)); } -static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, +static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, struct io_pgtable_cfg *cfg) { dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep), - sizeof(*ptep), DMA_TO_DEVICE); + sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, - struct io_pgtable_cfg *cfg) + int num_entries, struct io_pgtable_cfg *cfg) { - *ptep = pte; + int i; + + for (i = 0; i < num_entries; i++) + ptep[i] = pte; if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, cfg); + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -255,47 +258,54 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, phys_addr_t paddr, arm_lpae_iopte prot, - int lvl, arm_lpae_iopte *ptep) + int lvl, int num_entries, arm_lpae_iopte *ptep) { arm_lpae_iopte pte = prot; + struct io_pgtable_cfg *cfg = &data->iop.cfg; + size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); + int i; if (data->iop.fmt != ARM_MALI_LPAE && lvl == ARM_LPAE_MAX_LEVELS - 1) pte |= ARM_LPAE_PTE_TYPE_PAGE; else pte |= ARM_LPAE_PTE_TYPE_BLOCK; - pte |= paddr_to_iopte(paddr, data); + for (i = 0; i < num_entries; i++) + ptep[i] = pte | paddr_to_iopte(paddr + i * sz, data); - __arm_lpae_set_pte(ptep, pte, &data->iop.cfg); + if (!cfg->coherent_walk) + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, unsigned long iova, phys_addr_t paddr, - arm_lpae_iopte prot, int lvl, + arm_lpae_iopte prot, int lvl, int num_entries, arm_lpae_iopte *ptep) { - arm_lpae_iopte pte = *ptep; + int i; - if (iopte_leaf(pte, lvl, data->iop.fmt)) { - /* We require an unmap first */ - WARN_ON(!selftest_running); - return -EEXIST; - } else if (iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE) { - /* - * We need to unmap and free the old table before - * overwriting it with a block entry. - */ - arm_lpae_iopte *tblp; - size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); + for (i = 0; i < num_entries; i++) + if (iopte_leaf(ptep[i], lvl, data->iop.fmt)) { + /* We require an unmap first */ + WARN_ON(!selftest_running); + return -EEXIST; + } else if (iopte_type(ptep[i]) == ARM_LPAE_PTE_TYPE_TABLE) { + /* + * We need to unmap and free the old table before + * overwriting it with a block entry. + */ + arm_lpae_iopte *tblp; + size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); - tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); - if (__arm_lpae_unmap(data, NULL, iova, sz, lvl, tblp) != sz) { - WARN_ON(1); - return -EINVAL; + tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); + if (__arm_lpae_unmap(data, NULL, iova + i * sz, sz, + lvl, tblp) != sz) { + WARN_ON(1); + return -EINVAL; + } } - } - __arm_lpae_init_pte(data, paddr, prot, lvl, ptep); + __arm_lpae_init_pte(data, paddr, prot, lvl, num_entries, ptep); return 0; } @@ -323,7 +333,7 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, return old; /* Even if it's not ours, there's no point waiting; just kick it */ - __arm_lpae_sync_pte(ptep, cfg); + __arm_lpae_sync_pte(ptep, 1, cfg); if (old == curr) WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC); @@ -344,7 +354,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, /* If we can install a leaf entry at this level, then do so */ if (size == block_size) - return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep); + return arm_lpae_init_pte(data, iova, paddr, prot, lvl, 1, ptep); /* We can't allocate tables at the final level */ if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1)) @@ -361,7 +371,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (pte) __arm_lpae_free_pages(cptep, tblsz, cfg); } else if (!cfg->coherent_walk && !(pte & ARM_LPAE_PTE_SW_SYNC)) { - __arm_lpae_sync_pte(ptep, cfg); + __arm_lpae_sync_pte(ptep, 1, cfg); } if (pte && !iopte_leaf(pte, lvl, data->iop.fmt)) { @@ -543,7 +553,7 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, if (i == unmap_idx) continue; - __arm_lpae_init_pte(data, blk_paddr, pte, lvl, &tablep[i]); + __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]); } pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg); @@ -585,7 +595,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, /* If the size matches this level, we're in the right place */ if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { - __arm_lpae_set_pte(ptep, 0, &iop->cfg); + __arm_lpae_set_pte(ptep, 0, 1, &iop->cfg); if (!iopte_leaf(pte, lvl, iop->fmt)) { /* Also flush any partial walks */ From 1fe27be5ffec20637a8fe17ec7c1e88f3aaf62f0 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:51 -0700 Subject: [PATCH 035/474] iommu/io-pgtable-arm: Implement arm_lpae_unmap_pages() Implement the unmap_pages() callback for the ARM LPAE io-pgtable format. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-11-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm.c | 118 ++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index ea66b10c04c4..fe8fa0ee9c98 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -46,6 +46,9 @@ #define ARM_LPAE_PGD_SIZE(d) \ (sizeof(arm_lpae_iopte) << (d)->pgd_bits) +#define ARM_LPAE_PTES_PER_TABLE(d) \ + (ARM_LPAE_GRANULE(d) >> ilog2(sizeof(arm_lpae_iopte))) + /* * Calculate the index at level l used to map virtual address a using the * pagetable in d. @@ -239,22 +242,19 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } -static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte, - int num_entries, struct io_pgtable_cfg *cfg) +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) { - int i; - for (i = 0; i < num_entries; i++) - ptep[i] = pte; + *ptep = 0; if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, num_entries, cfg); + __arm_lpae_sync_pte(ptep, 1, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, int lvl, - arm_lpae_iopte *ptep); + unsigned long iova, size_t size, size_t pgcount, + int lvl, arm_lpae_iopte *ptep); static void __arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, phys_addr_t paddr, arm_lpae_iopte prot, @@ -298,7 +298,7 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data, size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data); tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data); - if (__arm_lpae_unmap(data, NULL, iova + i * sz, sz, + if (__arm_lpae_unmap(data, NULL, iova + i * sz, sz, 1, lvl, tblp) != sz) { WARN_ON(1); return -EINVAL; @@ -526,14 +526,15 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, arm_lpae_iopte blk_pte, int lvl, - arm_lpae_iopte *ptep) + arm_lpae_iopte *ptep, size_t pgcount) { struct io_pgtable_cfg *cfg = &data->iop.cfg; arm_lpae_iopte pte, *tablep; phys_addr_t blk_paddr; size_t tablesz = ARM_LPAE_GRANULE(data); size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); - int i, unmap_idx = -1; + int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); + int i, unmap_idx_start = -1, num_entries = 0, max_entries; if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) return 0; @@ -542,15 +543,18 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, if (!tablep) return 0; /* Bytes unmapped */ - if (size == split_sz) - unmap_idx = ARM_LPAE_LVL_IDX(iova, lvl, data); + if (size == split_sz) { + unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); + max_entries = ptes_per_table - unmap_idx_start; + num_entries = min_t(int, pgcount, max_entries); + } blk_paddr = iopte_to_paddr(blk_pte, data); pte = iopte_prot(blk_pte); - for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) { + for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) { /* Unmap! */ - if (i == unmap_idx) + if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries)) continue; __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]); @@ -568,76 +572,92 @@ static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, return 0; tablep = iopte_deref(pte, data); - } else if (unmap_idx >= 0) { - io_pgtable_tlb_add_page(&data->iop, gather, iova, size); - return size; + } else if (unmap_idx_start >= 0) { + for (i = 0; i < num_entries; i++) + io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size); + + return num_entries * size; } - return __arm_lpae_unmap(data, gather, iova, size, lvl, tablep); + return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, int lvl, - arm_lpae_iopte *ptep) + unsigned long iova, size_t size, size_t pgcount, + int lvl, arm_lpae_iopte *ptep) { arm_lpae_iopte pte; struct io_pgtable *iop = &data->iop; + int i = 0, num_entries, max_entries, unmap_idx_start; /* Something went horribly wrong and we ran out of page table */ if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) return 0; - ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); + ptep += unmap_idx_start; pte = READ_ONCE(*ptep); if (WARN_ON(!pte)) return 0; /* If the size matches this level, we're in the right place */ if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { - __arm_lpae_set_pte(ptep, 0, 1, &iop->cfg); + max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; + num_entries = min_t(int, pgcount, max_entries); - if (!iopte_leaf(pte, lvl, iop->fmt)) { - /* Also flush any partial walks */ - io_pgtable_tlb_flush_walk(iop, iova, size, - ARM_LPAE_GRANULE(data)); - ptep = iopte_deref(pte, data); - __arm_lpae_free_pgtable(data, lvl + 1, ptep); - } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { - /* - * Order the PTE update against queueing the IOVA, to - * guarantee that a flush callback from a different CPU - * has observed it before the TLBIALL can be issued. - */ - smp_wmb(); - } else { - io_pgtable_tlb_add_page(iop, gather, iova, size); + while (i < num_entries) { + pte = READ_ONCE(*ptep); + if (WARN_ON(!pte)) + break; + + __arm_lpae_clear_pte(ptep, &iop->cfg); + + if (!iopte_leaf(pte, lvl, iop->fmt)) { + /* Also flush any partial walks */ + io_pgtable_tlb_flush_walk(iop, iova + i * size, size, + ARM_LPAE_GRANULE(data)); + __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); + } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { + /* + * Order the PTE update against queueing the IOVA, to + * guarantee that a flush callback from a different CPU + * has observed it before the TLBIALL can be issued. + */ + smp_wmb(); + } else { + io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); + } + + ptep++; + i++; } - return size; + return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { /* * Insert a table at the next level to map the old region, * minus the part we want to unmap */ return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, - lvl + 1, ptep); + lvl + 1, ptep, pgcount); } /* Keep on walkin' */ ptep = iopte_deref(pte, data); - return __arm_lpae_unmap(data, gather, iova, size, lvl + 1, ptep); + return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl + 1, ptep); } -static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) +static size_t arm_lpae_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); struct io_pgtable_cfg *cfg = &data->iop.cfg; arm_lpae_iopte *ptep = data->pgd; long iaext = (s64)iova >> cfg->ias; - if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) return 0; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) @@ -645,7 +665,14 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iaext)) return 0; - return __arm_lpae_unmap(data, gather, iova, size, data->start_level, ptep); + return __arm_lpae_unmap(data, gather, iova, pgsize, pgcount, + data->start_level, ptep); +} + +static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) +{ + return arm_lpae_unmap_pages(ops, iova, size, 1, gather); } static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, @@ -761,6 +788,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) data->iop.ops = (struct io_pgtable_ops) { .map = arm_lpae_map, .unmap = arm_lpae_unmap, + .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, }; From 4a77b12deb253bd0d56bab6e9f5ad09cef754c36 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:52 -0700 Subject: [PATCH 036/474] iommu/io-pgtable-arm: Implement arm_lpae_map_pages() Implement the map_pages() callback for the ARM LPAE io-pgtable format. Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-12-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm.c | 41 +++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index fe8fa0ee9c98..053df4048a29 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -341,20 +341,30 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table, } static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, - phys_addr_t paddr, size_t size, arm_lpae_iopte prot, - int lvl, arm_lpae_iopte *ptep, gfp_t gfp) + phys_addr_t paddr, size_t size, size_t pgcount, + arm_lpae_iopte prot, int lvl, arm_lpae_iopte *ptep, + gfp_t gfp, size_t *mapped) { arm_lpae_iopte *cptep, pte; size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data); size_t tblsz = ARM_LPAE_GRANULE(data); struct io_pgtable_cfg *cfg = &data->iop.cfg; + int ret = 0, num_entries, max_entries, map_idx_start; /* Find our entry at the current level */ - ptep += ARM_LPAE_LVL_IDX(iova, lvl, data); + map_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); + ptep += map_idx_start; /* If we can install a leaf entry at this level, then do so */ - if (size == block_size) - return arm_lpae_init_pte(data, iova, paddr, prot, lvl, 1, ptep); + if (size == block_size) { + max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start; + num_entries = min_t(int, pgcount, max_entries); + ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep); + if (!ret && mapped) + *mapped += num_entries * size; + + return ret; + } /* We can't allocate tables at the final level */ if (WARN_ON(lvl >= ARM_LPAE_MAX_LEVELS - 1)) @@ -383,7 +393,8 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, } /* Rinse, repeat */ - return __arm_lpae_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp); + return __arm_lpae_map(data, iova, paddr, size, pgcount, prot, lvl + 1, + cptep, gfp, mapped); } static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, @@ -450,8 +461,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, return pte; } -static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp) +static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int iommu_prot, gfp_t gfp, size_t *mapped) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); struct io_pgtable_cfg *cfg = &data->iop.cfg; @@ -460,7 +472,7 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, arm_lpae_iopte prot; long iaext = (s64)iova >> cfg->ias; - if (WARN_ON(!size || (size & cfg->pgsize_bitmap) != size)) + if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize)) return -EINVAL; if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) @@ -473,7 +485,8 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, return 0; prot = arm_lpae_prot_to_pte(data, iommu_prot); - ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep, gfp); + ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl, + ptep, gfp, mapped); /* * Synchronise all PTE updates for the new mapping before there's * a chance for anything to kick off a table walk for the new iova. @@ -483,6 +496,13 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, return ret; } +static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t size, int iommu_prot, gfp_t gfp) +{ + return arm_lpae_map_pages(ops, iova, paddr, size, 1, iommu_prot, gfp, + NULL); +} + static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl, arm_lpae_iopte *ptep) { @@ -787,6 +807,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) data->iop.ops = (struct io_pgtable_ops) { .map = arm_lpae_map, + .map_pages = arm_lpae_map_pages, .unmap = arm_lpae_unmap, .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, From f13eabcf9dfad938333c0b6cc6bf4c05cc25eb4f Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:53 -0700 Subject: [PATCH 037/474] iommu/io-pgtable-arm-v7s: Implement arm_v7s_unmap_pages() Implement the unmap_pages() callback for the ARM v7s io-pgtable format. Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-13-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index d4004bcf333a..1af060686985 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -710,15 +710,32 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, return __arm_v7s_unmap(data, gather, iova, size, lvl + 1, ptep); } -static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) +static size_t arm_v7s_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); + size_t unmapped = 0, ret; if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias))) return 0; - return __arm_v7s_unmap(data, gather, iova, size, 1, data->pgd); + while (pgcount--) { + ret = __arm_v7s_unmap(data, gather, iova, pgsize, 1, data->pgd); + if (!ret) + break; + + unmapped += pgsize; + iova += pgsize; + } + + return unmapped; +} + +static size_t arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova, + size_t size, struct iommu_iotlb_gather *gather) +{ + return arm_v7s_unmap_pages(ops, iova, size, 1, gather); } static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops, @@ -781,6 +798,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, data->iop.ops = (struct io_pgtable_ops) { .map = arm_v7s_map, .unmap = arm_v7s_unmap, + .unmap_pages = arm_v7s_unmap_pages, .iova_to_phys = arm_v7s_iova_to_phys, }; From 23c30bed9c3c9062774488dc5223aa1ee850e028 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:54 -0700 Subject: [PATCH 038/474] iommu/io-pgtable-arm-v7s: Implement arm_v7s_map_pages() Implement the map_pages() callback for the ARM v7s io-pgtable format. Signed-off-by: Isaac J. Manjarres Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-14-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 1af060686985..5db90d7ce2ec 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -519,11 +519,12 @@ static int __arm_v7s_map(struct arm_v7s_io_pgtable *data, unsigned long iova, return __arm_v7s_map(data, iova, paddr, size, prot, lvl + 1, cptep, gfp); } -static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); - int ret; + int ret = -EINVAL; if (WARN_ON(iova >= (1ULL << data->iop.cfg.ias) || paddr >= (1ULL << data->iop.cfg.oas))) @@ -533,7 +534,17 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, if (!(prot & (IOMMU_READ | IOMMU_WRITE))) return 0; - ret = __arm_v7s_map(data, iova, paddr, size, prot, 1, data->pgd, gfp); + while (pgcount--) { + ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd, + gfp); + if (ret) + break; + + iova += pgsize; + paddr += pgsize; + if (mapped) + *mapped += pgsize; + } /* * Synchronise all PTE updates for the new mapping before there's * a chance for anything to kick off a table walk for the new iova. @@ -543,6 +554,12 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, return ret; } +static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + return arm_v7s_map_pages(ops, iova, paddr, size, 1, prot, gfp, NULL); +} + static void arm_v7s_free_pgtable(struct io_pgtable *iop) { struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop); @@ -797,6 +814,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, data->iop.ops = (struct io_pgtable_ops) { .map = arm_v7s_map, + .map_pages = arm_v7s_map_pages, .unmap = arm_v7s_unmap, .unmap_pages = arm_v7s_unmap_pages, .iova_to_phys = arm_v7s_iova_to_phys, From 9ea1a2c4944851aae4d821b3a6c99ade2bc3bbe6 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:55 -0700 Subject: [PATCH 039/474] iommu/arm-smmu: Implement the unmap_pages() IOMMU driver callback Implement the unmap_pages() callback for the ARM SMMU driver to allow calls from iommu_unmap to unmap multiple pages of the same size in one call. Also, remove the unmap() callback for the SMMU driver, as it will no longer be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-15-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f22dbeb1e510..9a0c49060bf3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1215,8 +1215,9 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ret; } -static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) +static size_t arm_smmu_unmap_pages(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; @@ -1226,7 +1227,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, return 0; arm_smmu_rpm_get(smmu); - ret = ops->unmap(ops, iova, size, gather); + ret = ops->unmap_pages(ops, iova, pgsize, pgcount, iotlb_gather); arm_smmu_rpm_put(smmu); return ret; @@ -1583,7 +1584,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_free = arm_smmu_domain_free, .attach_dev = arm_smmu_attach_dev, .map = arm_smmu_map, - .unmap = arm_smmu_unmap, + .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, From 808035317b22e2dd8d2cfef2083ad1743ac0c0e2 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Wed, 16 Jun 2021 06:38:56 -0700 Subject: [PATCH 040/474] iommu/arm-smmu: Implement the map_pages() IOMMU driver callback Implement the map_pages() callback for the ARM SMMU driver to allow calls from iommu_map to map multiple pages of the same size in one call. Also, remove the map() callback for the ARM SMMU driver, as it will no longer be used. Signed-off-by: Isaac J. Manjarres Suggested-by: Will Deacon Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/1623850736-389584-16-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 9a0c49060bf3..5ed4408d4b28 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1198,8 +1198,9 @@ rpm_put: return ret; } -static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; @@ -1209,7 +1210,7 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return -ENODEV; arm_smmu_rpm_get(smmu); - ret = ops->map(ops, iova, paddr, size, prot, gfp); + ret = ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp, mapped); arm_smmu_rpm_put(smmu); return ret; @@ -1583,7 +1584,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_alloc = arm_smmu_domain_alloc, .domain_free = arm_smmu_domain_free, .attach_dev = arm_smmu_attach_dev, - .map = arm_smmu_map, + .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, From 8119cefd9a29b71997e62b762932d23499ba4896 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 14 Jul 2021 18:17:58 +0530 Subject: [PATCH 041/474] powerpc/kexec: blacklist functions called in real mode for kprobe As kprobe does not handle events happening in real mode, blacklist the functions that only get called in real mode or in kexec sequence with MMU turned off. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/162626687834.155313.4692863392927831843.stgit@hbathini-workstation.ibm.com --- arch/powerpc/kernel/head_64.S | 2 ++ arch/powerpc/kexec/core_64.c | 6 ++++-- arch/powerpc/mm/book3s64/hash_native.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++- arch/powerpc/platforms/ps3/htab.c | 3 ++- arch/powerpc/platforms/ps3/mm.c | 8 ++++++-- arch/powerpc/platforms/pseries/lpar.c | 9 ++++++--- 8 files changed, 25 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 79930b0bc781..f17ae2083733 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -712,6 +712,8 @@ _GLOBAL(copy_and_flush) isync blr +_ASM_NOKPROBE_SYMBOL(copy_and_flush); /* Called in real mode */ + .align 8 copy_to_here: diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 8a449b2d8715..84618d3c8013 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -72,7 +72,8 @@ int default_machine_kexec_prepare(struct kimage *image) return 0; } -static void copy_segments(unsigned long ind) +/* Called during kexec sequence with MMU off */ +static notrace void copy_segments(unsigned long ind) { unsigned long entry; unsigned long *ptr; @@ -105,7 +106,8 @@ static void copy_segments(unsigned long ind) } } -void kexec_copy_flush(struct kimage *image) +/* Called during kexec sequence with MMU off */ +notrace void kexec_copy_flush(struct kimage *image) { long i, nr_segments = image->nr_segments; struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 52e170bd95ae..d8279bfe68ea 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -787,7 +787,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9ffa65074cb0..300099de553b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -172,8 +172,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e50ddf129c15..ae20add7954a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -679,7 +679,8 @@ void radix__early_init_mmu_secondary(void) mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 7ddc7ec6a7c0..ef710a715903 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -169,7 +169,8 @@ static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, spin_unlock_irqrestore(&ps3_htab_lock, flags); } -static void ps3_hpte_clear(void) +/* Called during kexec sequence with MMU off */ +static notrace void ps3_hpte_clear(void) { unsigned long hpte_count = (1UL << ppc64_pft_size) >> 4; u64 i; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index a81eac35d900..9c44f335c0b9 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -195,9 +195,11 @@ fail: /** * ps3_mm_vas_destroy - + * + * called during kexec sequence with MMU off. */ -void ps3_mm_vas_destroy(void) +notrace void ps3_mm_vas_destroy(void) { int result; @@ -1243,9 +1245,11 @@ void __init ps3_mm_init(void) /** * ps3_mm_shutdown - final cleanup of address space + * + * called during kexec sequence with MMU off. */ -void ps3_mm_shutdown(void) +notrace void ps3_mm_shutdown(void) { ps3_mm_region_destroy(&map.r1); } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index dab356e3ff87..869ef638698a 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -801,7 +801,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) return -1; } -static void manual_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void manual_hpte_clear_all(void) { unsigned long size_bytes = 1UL << ppc64_pft_size; unsigned long hpte_count = size_bytes >> 4; @@ -834,7 +835,8 @@ static void manual_hpte_clear_all(void) } } -static int hcall_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace int hcall_hpte_clear_all(void) { int rc; @@ -845,7 +847,8 @@ static int hcall_hpte_clear_all(void) return rc; } -static void pseries_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void pseries_hpte_clear_all(void) { int rc; From 1d479f160c500249d8fa4d21e7d2b7aaffc04daf Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:15 +0800 Subject: [PATCH 042/474] iommu: Deprecate Intel and AMD cmdline methods to enable strict mode Now that the x86 drivers support iommu.strict, deprecate the custom methods. Signed-off-by: John Garry Acked-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-2-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- Documentation/admin-guide/kernel-parameters.txt | 9 ++------- drivers/iommu/amd/init.c | 4 +++- drivers/iommu/intel/iommu.c | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb22006f713..a04d2748c99a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -290,10 +290,7 @@ amd_iommu= [HW,X86-64] Pass parameters to the AMD IOMMU driver in the system. Possible values are: - fullflush - enable flushing of IO/TLB entries when - they are unmapped. Otherwise they are - flushed before they will be reused, which - is a lot of faster + fullflush - Deprecated, equivalent to iommu.strict=1 off - do not initialize any AMD IOMMU found in the system force_isolation - Force device isolation for all @@ -1944,9 +1941,7 @@ this case, gfx device will use physical address for DMA. strict [Default Off] - With this option on every unmap_single operation will - result in a hardware IOTLB flush operation as opposed - to batching them for performance. + Deprecated, equivalent to iommu.strict=1. sp_off [Default Off] By default, super page will be supported if Intel IOMMU has the capability. With this option, super page will diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 46280e6e1535..3a2fb805f11e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3098,8 +3098,10 @@ static int __init parse_amd_iommu_intr(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "fullflush", 9) == 0) + if (strncmp(str, "fullflush", 9) == 0) { + pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); amd_iommu_unmap_flush = true; + } if (strncmp(str, "force_enable", 12) == 0) amd_iommu_force_enable = true; if (strncmp(str, "off", 3) == 0) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index dd22fc7d5176..c6da5dadd12e 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -454,6 +454,7 @@ static int __init intel_iommu_setup(char *str) pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n"); iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { + pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); pr_info("Disable batched IOTLB flush\n"); intel_iommu_strict = 1; } else if (!strncmp(str, "sp_off", 6)) { From d8577d2e331dc06f5871afa6b76035dd8b74c903 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:16 +0800 Subject: [PATCH 043/474] iommu: Print strict or lazy mode at init time As well as the default domain type, it's useful to know whether strict or lazy for DMA domains, so add this info in a separate print. The (stict/lazy) mode may be also set via iommu.strict earlyparm, but this will be processed prior to iommu_subsys_init(), so that print will be accurate for drivers which don't set the mode via custom means. For the drivers which set the mode via custom means - AMD and Intel drivers - they maintain prints to inform a change in policy or that custom cmdline methods to change policy are deprecated. Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-3-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 70a729ce88b1..69d7d4865668 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -139,6 +139,11 @@ static int __init iommu_subsys_init(void) (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? "(set via kernel command line)" : ""); + pr_info("DMA domain TLB invalidation policy: %s mode %s\n", + iommu_dma_strict ? "strict" : "lazy", + (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? + "(set via kernel command line)" : ""); + return 0; } subsys_initcall(iommu_subsys_init); From 712d8f205835c1d170e85d7811da3d9f36d8fab7 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 12 Jul 2021 19:12:17 +0800 Subject: [PATCH 044/474] iommu: Enhance IOMMU default DMA mode build options First, add build options IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the opportunity to set {lazy|strict} mode as default at build time. Then put the two config options in an choice, as they are mutually exclusive. [jpg: Make choice between strict and lazy only (and not passthrough)] Signed-off-by: Zhen Lei Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-4-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- .../admin-guide/kernel-parameters.txt | 3 +- drivers/iommu/Kconfig | 40 +++++++++++++++++++ drivers/iommu/iommu.c | 2 +- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a04d2748c99a..90b525cf0ec2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2042,9 +2042,10 @@ throughput at the cost of reduced device isolation. Will fall back to strict mode if not supported by the relevant IOMMU driver. - 1 - Strict mode (default). + 1 - Strict mode. DMA unmap operations invalidate IOMMU hardware TLBs synchronously. + unset - Use value of CONFIG_IOMMU_DEFAULT_{LAZY,STRICT}. Note: on x86, the default behaviour depends on the equivalent driver-specific parameters, but a strict mode explicitly specified by either method takes diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 07b7c25cbed8..9cd5d7afc766 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -90,6 +90,46 @@ config IOMMU_DEFAULT_PASSTHROUGH If unsure, say N here. +choice + prompt "IOMMU default DMA IOTLB invalidation mode" + depends on IOMMU_DMA + + default IOMMU_DEFAULT_STRICT + help + This option allows an IOMMU DMA IOTLB invalidation mode to be + chosen at build time, to override the default mode of each ARCH, + removing the need to pass in kernel parameters through command line. + It is still possible to provide common boot params to override this + config. + + If unsure, keep the default. + +config IOMMU_DEFAULT_STRICT + bool "strict" + help + For every IOMMU DMA unmap operation, the flush operation of IOTLB and + the free operation of IOVA are guaranteed to be done in the unmap + function. + +config IOMMU_DEFAULT_LAZY + bool "lazy" + help + Support lazy mode, where for every IOMMU DMA unmap operation, the + flush operation of IOTLB and the free operation of IOVA are deferred. + They are only guaranteed to be done before the related IOVA will be + reused. + + The isolation provided in this mode is not as secure as STRICT mode, + such that a vulnerable time window may be created between the DMA + unmap and the mappings cached in the IOMMU IOTLB or device TLB + finally being invalidated, where the device could still access the + memory which has already been unmapped by the device driver. + However this mode may provide better performance in high throughput + scenarios, and is still considerably more secure than passthrough + mode or no IOMMU. + +endchoice + config OF_IOMMU def_bool y depends on OF && IOMMU_API diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 69d7d4865668..bd9ccce387c5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -30,7 +30,7 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; -static bool iommu_dma_strict __read_mostly = true; +static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT); static u32 iommu_cmd_line __read_mostly; struct iommu_group { From d0e108b8e962e0aae65dc74ffda63b93b6ef32f4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 12 Jul 2021 19:12:18 +0800 Subject: [PATCH 045/474] iommu/vt-d: Add support for IOMMU default DMA mode build options Make IOMMU_DEFAULT_LAZY default for when INTEL_IOMMU config is set, as is current behaviour. Also delete global flag intel_iommu_strict: - In intel_iommu_setup(), call iommu_set_dma_strict(true) directly. Also remove the print, as iommu_subsys_init() prints the mode and we have already marked this param as deprecated. - For cap_caching_mode() check in intel_iommu_setup(), call iommu_set_dma_strict(true) directly; also reword the accompanying print with a level downgrade and also add the missing '\n'. - For Ironlake GPU, again call iommu_set_dma_strict(true) directly and keep the accompanying print. [jpg: Remove intel_iommu_strict] Signed-off-by: Zhen Lei Signed-off-by: John Garry Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-5-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 1 + drivers/iommu/intel/iommu.c | 15 ++++++--------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 9cd5d7afc766..265d7a6c9d3a 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -94,6 +94,7 @@ choice prompt "IOMMU default DMA IOTLB invalidation mode" depends on IOMMU_DMA + default IOMMU_DEFAULT_LAZY if INTEL_IOMMU default IOMMU_DEFAULT_STRICT help This option allows an IOMMU DMA IOTLB invalidation mode to be diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c6da5dadd12e..6fd004a1a66d 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -361,7 +361,6 @@ int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); static int dmar_map_gfx = 1; -static int intel_iommu_strict; static int intel_iommu_superpage = 1; static int iommu_identity_mapping; static int iommu_skip_te_disable; @@ -455,8 +454,7 @@ static int __init intel_iommu_setup(char *str) iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); - pr_info("Disable batched IOTLB flush\n"); - intel_iommu_strict = 1; + iommu_set_dma_strict(true); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; @@ -4394,9 +4392,9 @@ int __init intel_iommu_init(void) * is likely to be much lower than the overhead of synchronizing * the virtual and physical IOMMU page-tables. */ - if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) { - pr_warn("IOMMU batching is disabled due to virtualization"); - intel_iommu_strict = 1; + if (cap_caching_mode(iommu->cap)) { + pr_info_once("IOMMU batching disallowed due to virtualization\n"); + iommu_set_dma_strict(true); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, @@ -4405,7 +4403,6 @@ int __init intel_iommu_init(void) } up_read(&dmar_global_lock); - iommu_set_dma_strict(intel_iommu_strict); bus_set_iommu(&pci_bus_type, &intel_iommu_ops); if (si_domain && !hw_pass_through) register_memory_notifier(&intel_iommu_memory_nb); @@ -5715,8 +5712,8 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } else if (dmar_map_gfx) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); - intel_iommu_strict = 1; - } + iommu_set_dma_strict(true); + } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt); From 02252b3bfe9f98770a8d902925711676ff5bd766 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 12 Jul 2021 19:12:19 +0800 Subject: [PATCH 046/474] iommu/amd: Add support for IOMMU default DMA mode build options Make IOMMU_DEFAULT_LAZY default for when AMD_IOMMU config is set, which matches current behaviour. For "fullflush" param, just call iommu_set_dma_strict(true) directly. Since we get a strict vs lazy mode print already in iommu_subsys_init(), and maintain a deprecation print when "fullflush" param is passed, drop the prints in amd_iommu_init_dma_ops(). Finally drop global flag amd_iommu_unmap_flush, as it has no longer has any purpose. [jpg: Rebase for relocated file and drop amd_iommu_unmap_flush] Signed-off-by: Zhen Lei Signed-off-by: John Garry Link: https://lore.kernel.org/r/1626088340-5838-6-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 2 +- drivers/iommu/amd/amd_iommu_types.h | 6 ------ drivers/iommu/amd/init.c | 3 +-- drivers/iommu/amd/iommu.c | 6 ------ 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 265d7a6c9d3a..c84da8205be7 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -94,7 +94,7 @@ choice prompt "IOMMU default DMA IOTLB invalidation mode" depends on IOMMU_DMA - default IOMMU_DEFAULT_LAZY if INTEL_IOMMU + default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU) default IOMMU_DEFAULT_STRICT help This option allows an IOMMU DMA IOTLB invalidation mode to be diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 94c1a7a9876d..8dbe61e2b3c1 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -779,12 +779,6 @@ extern u16 amd_iommu_last_bdf; /* allocation bitmap for domain ids */ extern unsigned long *amd_iommu_pd_alloc_bitmap; -/* - * If true, the addresses will be flushed on unmap time, not when - * they are reused - */ -extern bool amd_iommu_unmap_flush; - /* Smallest max PASID supported by any IOMMU in the system */ extern u32 amd_iommu_max_pasid; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 3a2fb805f11e..1e641cb6dddc 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -161,7 +161,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ @@ -3100,7 +3099,7 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) { pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); - amd_iommu_unmap_flush = true; + iommu_set_dma_strict(true); } if (strncmp(str, "force_enable", 12) == 0) amd_iommu_force_enable = true; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 811a49a95d04..52fe2326042a 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1775,12 +1775,6 @@ void amd_iommu_domain_update(struct protection_domain *domain) static void __init amd_iommu_init_dma_ops(void) { swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 1 : 0; - - if (amd_iommu_unmap_flush) - pr_info("IO/TLB flush on unmap enabled\n"); - else - pr_info("Lazy IO/TLB flushing enabled\n"); - iommu_set_dma_strict(amd_iommu_unmap_flush); } int __init amd_iommu_init_api(void) From 308723e3580027f0cd7c86a5edfe6b5acb6863d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 12 Jul 2021 19:12:20 +0800 Subject: [PATCH 047/474] iommu: Remove mode argument from iommu_set_dma_strict() We only ever now set strict mode enabled in iommu_set_dma_strict(), so just remove the argument. Signed-off-by: John Garry Reviewed-by: Robin Murphy Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/1626088340-5838-7-git-send-email-john.garry@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 2 +- drivers/iommu/intel/iommu.c | 6 +++--- drivers/iommu/iommu.c | 5 ++--- include/linux/iommu.h | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 1e641cb6dddc..6e12a615117b 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3099,7 +3099,7 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) { pr_warn("amd_iommu=fullflush deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } if (strncmp(str, "force_enable", 12) == 0) amd_iommu_force_enable = true; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6fd004a1a66d..da9afa730df1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -454,7 +454,7 @@ static int __init intel_iommu_setup(char *str) iommu_dma_forcedac = true; } else if (!strncmp(str, "strict", 6)) { pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } else if (!strncmp(str, "sp_off", 6)) { pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; @@ -4394,7 +4394,7 @@ int __init intel_iommu_init(void) */ if (cap_caching_mode(iommu->cap)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } iommu_device_sysfs_add(&iommu->iommu, NULL, intel_iommu_groups, @@ -5712,7 +5712,7 @@ static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev) } else if (dmar_map_gfx) { /* we have to ensure the gfx device is idle before we flush */ pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n"); - iommu_set_dma_strict(true); + iommu_set_dma_strict(); } } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index bd9ccce387c5..eeea5e5c4d10 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -350,10 +350,9 @@ static int __init iommu_dma_setup(char *str) } early_param("iommu.strict", iommu_dma_setup); -void iommu_set_dma_strict(bool strict) +void iommu_set_dma_strict(void) { - if (strict || !(iommu_cmd_line & IOMMU_CMD_LINE_STRICT)) - iommu_dma_strict = strict; + iommu_dma_strict = true; } bool iommu_get_dma_strict(struct iommu_domain *domain) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index d7989d4a7404..4997c78e2670 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -485,7 +485,7 @@ int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); -void iommu_set_dma_strict(bool val); +void iommu_set_dma_strict(void); bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, From 13b6eb6e1c98c47f7e0d6c74e8b22cfe189a84dd Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 15 Jul 2021 14:04:24 +0100 Subject: [PATCH 048/474] iommu: Streamline iommu_iova_to_phys() If people are going to insist on calling iommu_iova_to_phys() pointlessly and expecting it to work, we can at least do ourselves a favour by handling those cases in the core code, rather than repeatedly across an inconsistent handful of drivers. Since all the existing drivers implement the internal callback, and any future ones are likely to want to work with iommu-dma which relies on iova_to_phys a fair bit, we may as well remove that currently-redundant check as well and consider it mandatory. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/f564f3f6ff731b898ff7a898919bf871c2c7745a.1626354264.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/io_pgtable.c | 3 --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 --- drivers/iommu/iommu.c | 5 ++++- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index bb0ee5c9fde7..182c93a43efd 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -493,9 +493,6 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo unsigned long offset_mask, pte_pgsize; u64 *pte, __pte; - if (pgtable->mode == PAGE_MODE_NONE) - return iova; - pte = fetch_pte(pgtable, iova, &pte_pgsize); if (!pte || !IOMMU_PTE_PRESENT(*pte)) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 235f9bdaeaf2..6346f21726f4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2488,9 +2488,6 @@ arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; - if (domain->type == IOMMU_DOMAIN_IDENTITY) - return iova; - if (!ops) return 0; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 5ed4408d4b28..ac21170fa208 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1322,9 +1322,6 @@ static phys_addr_t arm_smmu_iova_to_phys(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; - if (domain->type == IOMMU_DOMAIN_IDENTITY) - return iova; - if (!ops) return 0; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index eeea5e5c4d10..f2cda9950bd5 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2372,7 +2372,10 @@ EXPORT_SYMBOL_GPL(iommu_detach_group); phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - if (unlikely(domain->ops->iova_to_phys == NULL)) + if (domain->type == IOMMU_DOMAIN_IDENTITY) + return iova; + + if (domain->type == IOMMU_DOMAIN_BLOCKED) return 0; return domain->ops->iova_to_phys(domain, iova); From 8bc54824da4e8fcf0ed679cf09ac32f23d83254a Mon Sep 17 00:00:00 2001 From: Xiyu Yang via iommu Date: Mon, 19 Jul 2021 16:32:58 +0800 Subject: [PATCH 049/474] iommu/amd: Convert from atomic_t to refcount_t on pasid_state->count refcount_t type and corresponding API can protect refcounters from accidental underflow and overflow and further use-after-free situations. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/1626683578-64214-1-git-send-email-xiyuyang19@fudan.edu.cn Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu_v2.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index f8d4ad421e07..a9e568276c99 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) "AMD-Vi: " fmt +#include #include #include #include @@ -33,7 +34,7 @@ struct pri_queue { struct pasid_state { struct list_head list; /* For global state-list */ - atomic_t count; /* Reference count */ + refcount_t count; /* Reference count */ unsigned mmu_notifier_count; /* Counting nested mmu_notifier calls */ struct mm_struct *mm; /* mm_struct for the faults */ @@ -242,7 +243,7 @@ static struct pasid_state *get_pasid_state(struct device_state *dev_state, ret = *ptr; if (ret) - atomic_inc(&ret->count); + refcount_inc(&ret->count); out_unlock: spin_unlock_irqrestore(&dev_state->lock, flags); @@ -257,14 +258,14 @@ static void free_pasid_state(struct pasid_state *pasid_state) static void put_pasid_state(struct pasid_state *pasid_state) { - if (atomic_dec_and_test(&pasid_state->count)) + if (refcount_dec_and_test(&pasid_state->count)) wake_up(&pasid_state->wq); } static void put_pasid_state_wait(struct pasid_state *pasid_state) { - atomic_dec(&pasid_state->count); - wait_event(pasid_state->wq, !atomic_read(&pasid_state->count)); + refcount_dec(&pasid_state->count); + wait_event(pasid_state->wq, !refcount_read(&pasid_state->count)); free_pasid_state(pasid_state); } @@ -624,7 +625,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, u32 pasid, goto out; - atomic_set(&pasid_state->count, 1); + refcount_set(&pasid_state->count, 1); init_waitqueue_head(&pasid_state->wq); spin_lock_init(&pasid_state->lock); From a886d5a7e67bc403b8e51ab50c10324bcdbb686f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:13 +0800 Subject: [PATCH 050/474] iommu/vt-d: Report real pgsize bitmap to iommu core The pgsize bitmap is used to advertise the page sizes our hardware supports to the IOMMU core, which will then use this information to split physically contiguous memory regions it is mapping into page sizes that we support. Traditionally the IOMMU core just handed us the mappings directly, after making sure the size is an order of a 4KiB page and that the mapping has natural alignment. To retain this behavior, we currently advertise that we support all page sizes that are an order of 4KiB. We are about to utilize the new IOMMU map/unmap_pages APIs. We could change this to advertise the real page sizes we support. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index da9afa730df1..4984cba8c00f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -85,24 +85,6 @@ #define LEVEL_STRIDE (9) #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) -/* - * This bitmap is used to advertise the page sizes our hardware support - * to the IOMMU core, which will then use this information to split - * physically contiguous memory regions it is mapping into page sizes - * that we support. - * - * Traditionally the IOMMU core just handed us the mappings directly, - * after making sure the size is an order of a 4KiB page and that the - * mapping has natural alignment. - * - * To retain this behavior, we currently advertise that we support - * all page sizes that are an order of 4KiB. - * - * If at some point we'd like to utilize the IOMMU core's new behavior, - * we could change this to advertise the real page sizes we support. - */ -#define INTEL_IOMMU_PGSIZES (~0xFFFUL) - static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -735,6 +717,23 @@ static int domain_update_device_node(struct dmar_domain *domain) static void domain_update_iotlb(struct dmar_domain *domain); +/* Return the super pagesize bitmap if supported. */ +static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) +{ + unsigned long bitmap = 0; + + /* + * 1-level super page supports page size of 2MiB, 2-level super page + * supports page size of both 2MiB and 1GiB. + */ + if (domain->iommu_superpage == 1) + bitmap |= SZ_2M; + else if (domain->iommu_superpage == 2) + bitmap |= SZ_2M | SZ_1G; + + return bitmap; +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { @@ -761,6 +760,7 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); + domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); domain_update_iotlb(domain); } @@ -5609,7 +5609,7 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, - .pgsize_bitmap = INTEL_IOMMU_PGSIZES, + .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM .cache_invalidate = intel_iommu_sva_invalidate, .sva_bind_gpasid = intel_svm_bind_gpasid, From 3f34f125977685e591def32984c77e23a86075b0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:14 +0800 Subject: [PATCH 051/474] iommu/vt-d: Implement map/unmap_pages() iommu_ops callback Implement the map_pages() and unmap_pages() callback for the Intel IOMMU driver to allow calls from iommu core to map and unmap multiple pages of the same size in one call. With map/unmap_pages() implemented, the prior map/unmap callbacks are deprecated. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 4984cba8c00f..e4c74296b34a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -5065,6 +5065,28 @@ static int intel_iommu_map(struct iommu_domain *domain, hpa >> VTD_PAGE_SHIFT, size, prot); } +static int intel_iommu_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) +{ + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + int ret; + + if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G) + return -EINVAL; + + if (!IS_ALIGNED(iova | paddr, pgsize)) + return -EINVAL; + + ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp); + if (!ret && mapped) + *mapped = size; + + return ret; +} + static size_t intel_iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather) @@ -5094,6 +5116,17 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, return size; } +static size_t intel_iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + unsigned long pgshift = __ffs(pgsize); + size_t size = pgcount << pgshift; + + return intel_iommu_unmap(domain, iova, size, gather); +} + static void intel_iommu_tlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { @@ -5591,9 +5624,9 @@ const struct iommu_ops intel_iommu_ops = { .aux_attach_dev = intel_iommu_aux_attach_device, .aux_detach_dev = intel_iommu_aux_detach_device, .aux_get_pasid = intel_iommu_aux_get_pasid, - .map = intel_iommu_map, + .map_pages = intel_iommu_map_pages, + .unmap_pages = intel_iommu_unmap_pages, .iotlb_sync_map = intel_iommu_iotlb_sync_map, - .unmap = intel_iommu_unmap, .flush_iotlb_all = intel_flush_iotlb_all, .iotlb_sync = intel_iommu_tlb_sync, .iova_to_phys = intel_iommu_iova_to_phys, From 75cc1018a9e1e57d4ae43a101fc08a070894d439 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 20 Jul 2021 10:06:15 +0800 Subject: [PATCH 052/474] iommu/vt-d: Move clflush'es from iotlb_sync_map() to map_pages() As the Intel VT-d driver has switched to use the iommu_ops.map_pages() callback, multiple pages of the same size will be mapped in a call. There's no need to put the clflush'es in iotlb_sync_map() callback. Move them back into __domain_mapping() to simplify the code. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720020615.4144323-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 48 ++++++------------------------------- 1 file changed, 7 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e4c74296b34a..c12cc955389a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2333,9 +2333,9 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) { + struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; - struct dma_pte *pte = NULL; phys_addr_t pteval; u64 attr; @@ -2368,6 +2368,8 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) return -ENOMEM; + first_pte = pte; + /* It is large page*/ if (largepage_lvl > 1) { unsigned long end_pfn; @@ -2415,14 +2417,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. nr_pages < lvl_pages). - * - * We leave clflush for the leaf pte changes to iotlb_sync_map() - * callback. */ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) + (largepage_lvl > 1 && nr_pages < lvl_pages)) { + domain_flush_cache(domain, first_pte, + (void *)pte - (void *)first_pte); pte = NULL; + } } return 0; @@ -5563,39 +5565,6 @@ static bool risky_device(struct pci_dev *pdev) return false; } -static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn, - unsigned long clf_pages) -{ - struct dma_pte *first_pte = NULL, *pte = NULL; - unsigned long lvl_pages = 0; - int level = 0; - - while (clf_pages > 0) { - if (!pte) { - level = 0; - pte = pfn_to_dma_pte(domain, clf_pfn, &level); - if (WARN_ON(!pte)) - return; - first_pte = pte; - lvl_pages = lvl_to_nr_pages(level); - } - - if (WARN_ON(!lvl_pages || clf_pages < lvl_pages)) - return; - - clf_pages -= lvl_pages; - clf_pfn += lvl_pages; - pte++; - - if (!clf_pages || first_pte_in_page(pte) || - (level > 1 && clf_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); - pte = NULL; - } - } -} - static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -5605,9 +5574,6 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, struct intel_iommu *iommu; int iommu_id; - if (!dmar_domain->iommu_coherency) - clflush_sync_map(dmar_domain, pfn, pages); - for_each_domain_iommu(iommu_id, dmar_domain) { iommu = g_iommus[iommu_id]; __mapping_notify_one(iommu, dmar_domain, pfn, pages); From ee974d9625c405977ef5d9aedc476be1d0362ebf Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Wed, 21 Jul 2021 16:44:53 +0300 Subject: [PATCH 053/474] iommu/amd: Fix printing of IOMMU events when rate limiting kicks in For the printing of RMP_HW_ERROR / RMP_PAGE_FAULT / IO_PAGE_FAULT events, the AMD IOMMU code uses such logic: if (pdev) dev_data = dev_iommu_priv_get(&pdev->dev); if (dev_data && __ratelimit(&dev_data->rs)) { pci_err(pdev, ... } else { printk_ratelimit() / pr_err{,_ratelimited}(... } This means that if we receive an event for a PCI devid which actually does have a struct pci_dev and an attached struct iommu_dev_data, but rate limiting kicks in, we'll fall back to the non-PCI branch of the test, and print the event in a different format. Fix this by changing the logic to: if (dev_data) { if (__ratelimit(&dev_data->rs)) { pci_err(pdev, ... } } else { pr_err_ratelimited(... } Suggested-by: Suravee Suthikulpanit Signed-off-by: Lennert Buytenhek Reviewed-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/YPgk1dD1gPMhJXgY@wantstofly.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 811a49a95d04..a7d6d78147b7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -425,9 +425,11 @@ static void amd_iommu_report_rmp_hw_error(volatile u32 *event) if (pdev) dev_data = dev_iommu_priv_get(&pdev->dev); - if (dev_data && __ratelimit(&dev_data->rs)) { - pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", - vmg_tag, spa, flags); + if (dev_data) { + if (__ratelimit(&dev_data->rs)) { + pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", + vmg_tag, spa, flags); + } } else { pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n", PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), @@ -456,9 +458,11 @@ static void amd_iommu_report_rmp_fault(volatile u32 *event) if (pdev) dev_data = dev_iommu_priv_get(&pdev->dev); - if (dev_data && __ratelimit(&dev_data->rs)) { - pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", - vmg_tag, gpa, flags_rmp, flags); + if (dev_data) { + if (__ratelimit(&dev_data->rs)) { + pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", + vmg_tag, gpa, flags_rmp, flags); + } } else { pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n", PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), @@ -480,11 +484,13 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id, if (pdev) dev_data = dev_iommu_priv_get(&pdev->dev); - if (dev_data && __ratelimit(&dev_data->rs)) { - pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", - domain_id, address, flags); - } else if (printk_ratelimit()) { - pr_err("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", + if (dev_data) { + if (__ratelimit(&dev_data->rs)) { + pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n", + domain_id, address, flags); + } + } else { + pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n", PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid), domain_id, address, flags); } From 16df55ce104125b4bf1070467973070fb6fc16ff Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jul 2021 09:02:56 -0700 Subject: [PATCH 054/474] mips: clean up (remove) kernel-doc in cavium-octeon/executive/ Remove all kernel-doc notation in arch/mips/cavium-octeon/executive/. This removes dozens of kernel-doc warnings. Most of these functions are static and don't need to be documented with kernel-doc. The function comments are still present for anyone who wants to read them. These files are part of the OCTEON SDK so presumably they are documented there as well. arch/mips/cavium-octeon/executive/cvmx-bootmem.c:61: warning: Function parameter or member 'addr' not described in 'CVMX_BOOTMEM_NAMED_GET_FIELD' arch/mips/cavium-octeon/executive/cvmx-bootmem.c:61: warning: Function parameter or member 'field' not described in 'CVMX_BOOTMEM_NAMED_GET_FIELD' arch/mips/cavium-octeon/executive/cvmx-bootmem.c:61: warning: expecting prototype for This macro returns a member of the(). Prototype was for CVMX_BOOTMEM_NAMED_GET_FIELD() instead arch/mips/cavium-octeon/executive/cvmx-bootmem.c:77: warning: Function parameter or member 'base' not described in '__cvmx_bootmem_desc_get' arch/mips/cavium-octeon/executive/cvmx-bootmem.c:77: warning: Function parameter or member 'offset' not described in '__cvmx_bootmem_desc_get' arch/mips/cavium-octeon/executive/cvmx-bootmem.c:77: warning: Function parameter or member 'size' not described in '__cvmx_bootmem_desc_get' arch/mips/cavium-octeon/executive/cvmx-bootmem.c:77: warning: expecting prototype for This function is the implementation of the get macros defined(). Prototype was for __cvmx_bootmem_desc_get() instead arch/mips/cavium-octeon/executive/cvmx-bootmem.c:133: warning: expecting prototype for Allocate a block of memory from the free list that was(). Prototype was for cvmx_bootmem_alloc_range() instead arch/mips/cavium-octeon/executive/cvmx-bootmem.c:554: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Finds a named memory block by name. arch/mips/cavium-octeon/executive/cvmx-bootmem.c:661: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Frees a named block. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-helper-board.c:64: warning: expecting prototype for Return the MII PHY address associated with the given IPD(). Prototype was for cvmx_helper_board_get_mii_address() instead arch/mips/cavium-octeon/executive/cvmx-helper-board.c:211: warning: expecting prototype for This function is the board specific method of determining an(). Prototype was for __cvmx_helper_board_link_get() instead arch/mips/cavium-octeon/executive/cvmx-helper-board.c:278: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * This function is called by cvmx_helper_interface_probe() after it arch/mips/cavium-octeon/executive/cvmx-helper-board.c:324: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get the clock type used for the USB block based on board type. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-spi.c:77: warning: expecting prototype for Get current SPI4 initialization callbacks(). Prototype was for cvmx_spi_get_callbacks() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:87: warning: expecting prototype for Set new SPI4 initialization callbacks(). Prototype was for cvmx_spi_set_callbacks() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:92: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Initialize and start the SPI interface. arch/mips/cavium-octeon/executive/cvmx-spi.c:151: warning: expecting prototype for This routine restarts the SPI interface after it has lost synchronization(). Prototype was for cvmx_spi_restart_interface() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:196: warning: expecting prototype for Callback to perform SPI4 reset(). Prototype was for cvmx_spi_reset_cb() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:313: warning: expecting prototype for Callback to setup calendar and miscellaneous settings before clock detection(). Prototype was for cvmx_spi_calendar_setup_cb() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:431: warning: expecting prototype for Callback to perform clock detection(). Prototype was for cvmx_spi_clock_detect_cb() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:509: warning: expecting prototype for Callback to perform link training(). Prototype was for cvmx_spi_training_cb() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:578: warning: expecting prototype for Callback to perform calendar data synchronization(). Prototype was for cvmx_spi_calendar_sync_cb() instead arch/mips/cavium-octeon/executive/cvmx-spi.c:634: warning: expecting prototype for Callback to handle interface up(). Prototype was for cvmx_spi_interface_up_cb() instead ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c:67: warning: expecting prototype for Probe a XAUI interface and determine the number of ports(). Prototype was for __cvmx_helper_xaui_probe() instead arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c:106: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Bringup and enable a XAUI interface. After this call packet arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c:253: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return the link state of an IPD/PKO port as returned by arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c:292: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Configure an IPD/PKO port for the specified link state. This ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-pko.c:43: warning: Function parameter or member 'interface' not described in '__cvmx_pko_int' arch/mips/cavium-octeon/executive/cvmx-pko.c:43: warning: Function parameter or member 'index' not described in '__cvmx_pko_int' arch/mips/cavium-octeon/executive/cvmx-pko.c:43: warning: expecting prototype for Internal state of packet output(). Prototype was for __cvmx_pko_int() instead arch/mips/cavium-octeon/executive/cvmx-pko.c:186: warning: expecting prototype for Call before any other calls to initialize the packet(). Prototype was for cvmx_pko_initialize_global() instead arch/mips/cavium-octeon/executive/cvmx-pko.c:241: warning: expecting prototype for This function does per(). Prototype was for cvmx_pko_initialize_local() instead arch/mips/cavium-octeon/executive/cvmx-pko.c:247: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Enables the packet output hardware. It must already be arch/mips/cavium-octeon/executive/cvmx-pko.c:270: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Disables the packet output. Does not affect any configuration. arch/mips/cavium-octeon/executive/cvmx-pko.c:282: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Reset the packet output. arch/mips/cavium-octeon/executive/cvmx-pko.c:293: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Shutdown and free resources required by packet output. arch/mips/cavium-octeon/executive/cvmx-pko.c:324: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Configure a output port and the associated queues for use. arch/mips/cavium-octeon/executive/cvmx-pko.c:555: warning: expecting prototype for Show map of ports(). Prototype was for cvmx_pko_show_queue_map() instead arch/mips/cavium-octeon/executive/cvmx-pko.c:577: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Rate limit a PKO port to a max packets/sec. This function is only arch/mips/cavium-octeon/executive/cvmx-pko.c:610: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Rate limit a PKO port to a max bits/sec. This function is only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:49: warning: cannot understand function prototype: '__cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptr; ' arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:53: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Initialize the Global queue state pointer. arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:101: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Initialize a command queue for use. The initial FPA buffer is arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:199: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Shutdown a queue a free it's command buffers to the FPA. The arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:235: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return the number of command words pending in the queue. This arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c:291: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return the command buffer to be written to. The purpose of this ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:53: warning: expecting prototype for Probe RGMII ports and determine the number present(). Prototype was for __cvmx_helper_rgmii_probe() instead arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:92: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Put an RGMII interface in loopback mode. Internal packets sent arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:135: warning: expecting prototype for Workaround ASX setup errata with CN38XX pass1(). Prototype was for __cvmx_helper_errata_asx_pass1() instead arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:152: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Configure all of the ASX, GMX, and PKO registers required arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:255: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return the link state of an IPD/PKO port as returned by arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c:284: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Configure an IPD/PKO port for the specified link state. This -- arch/mips/cavium-octeon/executive/cvmx-l2c.c:768: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return log base 2 of the number of sets in the L2 cache arch/mips/cavium-octeon/executive/cvmx-l2c.c:861: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Flush a line from the L2 cache Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Aditya Srivastava Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- .../cavium-octeon/executive/cvmx-bootmem.c | 10 ++++----- .../cavium-octeon/executive/cvmx-cmd-queue.c | 12 +++++----- .../executive/cvmx-helper-board.c | 8 +++---- .../executive/cvmx-helper-rgmii.c | 12 +++++----- .../executive/cvmx-helper-xaui.c | 8 +++---- arch/mips/cavium-octeon/executive/cvmx-l2c.c | 9 ++++---- arch/mips/cavium-octeon/executive/cvmx-pko.c | 22 +++++++++---------- arch/mips/cavium-octeon/executive/cvmx-spi.c | 20 ++++++++--------- 8 files changed, 50 insertions(+), 51 deletions(-) diff --git a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c index e794b2d53adf..b63ad5d42cc7 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-bootmem.c +++ b/arch/mips/cavium-octeon/executive/cvmx-bootmem.c @@ -44,7 +44,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc; /* See header file for descriptions of functions */ -/** +/* * This macro returns a member of the * cvmx_bootmem_named_block_desc_t structure. These members can't * be directly addressed as they might be in memory not directly @@ -60,7 +60,7 @@ static struct cvmx_bootmem_desc *cvmx_bootmem_desc; offsetof(struct cvmx_bootmem_named_block_desc, field), \ sizeof_field(struct cvmx_bootmem_named_block_desc, field)) -/** +/* * This function is the implementation of the get macros defined * for individual structure members. The argument are generated * by the macros inorder to read only the needed memory. @@ -115,7 +115,7 @@ static uint64_t cvmx_bootmem_phy_get_next(uint64_t addr) return cvmx_read64_uint64((addr + NEXT_OFFSET) | (1ull << 63)); } -/** +/* * Allocate a block of memory from the free list that was * passed to the application by the bootloader within a specified * address range. This is an allocate-only algorithm, so @@ -550,7 +550,7 @@ bootmem_free_done: } -/** +/* * Finds a named memory block by name. * Also used for finding an unused entry in the named block table. * @@ -657,7 +657,7 @@ struct cvmx_bootmem_named_block_desc *cvmx_bootmem_find_named_block(char *name) } EXPORT_SYMBOL(cvmx_bootmem_find_named_block); -/** +/* * Frees a named block. * * @name: name of block to free diff --git a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c index fb42e8e21ea0..20189e9ad94d 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c +++ b/arch/mips/cavium-octeon/executive/cvmx-cmd-queue.c @@ -42,14 +42,14 @@ #include #include -/** +/* * This application uses this pointer to access the global queue * state. It points to a bootmem named block. */ __cvmx_cmd_queue_all_state_t *__cvmx_cmd_queue_state_ptr; EXPORT_SYMBOL_GPL(__cvmx_cmd_queue_state_ptr); -/** +/* * Initialize the Global queue state pointer. * * Returns CVMX_CMD_QUEUE_SUCCESS or a failure code @@ -84,7 +84,7 @@ static cvmx_cmd_queue_result_t __cvmx_cmd_queue_init_state_ptr(void) return CVMX_CMD_QUEUE_SUCCESS; } -/** +/* * Initialize a command queue for use. The initial FPA buffer is * allocated and the hardware unit is configured to point to the * new command queue. @@ -182,7 +182,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_initialize(cvmx_cmd_queue_id_t queue_id, } } -/** +/* * Shutdown a queue a free it's command buffers to the FPA. The * hardware connected to the queue must be stopped before this * function is called. @@ -218,7 +218,7 @@ cvmx_cmd_queue_result_t cvmx_cmd_queue_shutdown(cvmx_cmd_queue_id_t queue_id) return CVMX_CMD_QUEUE_SUCCESS; } -/** +/* * Return the number of command words pending in the queue. This * function may be relatively slow for some hardware units. * @@ -274,7 +274,7 @@ int cvmx_cmd_queue_length(cvmx_cmd_queue_id_t queue_id) return CVMX_CMD_QUEUE_INVALID_PARAM; } -/** +/* * Return the command buffer to be written to. The purpose of this * function is to allow CVMX routine access t othe low level buffer * for initial hardware setup. User applications should not call this diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c index abd11b7af22f..1daa0c6b6f4e 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-board.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-board.c @@ -44,7 +44,7 @@ #include #include -/** +/* * Return the MII PHY address associated with the given IPD * port. A result of -1 means there isn't a MII capable PHY * connected to this port. On chips supporting multiple MII @@ -189,7 +189,7 @@ int cvmx_helper_board_get_mii_address(int ipd_port) return -1; } -/** +/* * This function is the board specific method of determining an * ethernet ports link speed. Most Octeon boards have Marvell PHYs * and are handled by the fall through case. This function must be @@ -274,7 +274,7 @@ union cvmx_helper_link_info __cvmx_helper_board_link_get(int ipd_port) return result; } -/** +/* * This function is called by cvmx_helper_interface_probe() after it * determines the number of ports Octeon can support on a specific * interface. This function is the per board location to override @@ -320,7 +320,7 @@ int __cvmx_helper_board_interface_probe(int interface, int supported_ports) return supported_ports; } -/** +/* * Get the clock type used for the USB block based on board type. * Used by the USB code for auto configuration of clock type. * diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c index c4b58598aa9d..a8c3be4eb6f0 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c @@ -42,7 +42,7 @@ #include #include -/** +/* * Probe RGMII ports and determine the number present * * @interface: Interface to probe @@ -88,7 +88,7 @@ int __cvmx_helper_rgmii_probe(int interface) return num_ports; } -/** +/* * Put an RGMII interface in loopback mode. Internal packets sent * out will be received back again on the same port. Externally * received packets will echo back out. @@ -120,7 +120,7 @@ void cvmx_helper_rgmii_internal_loopback(int port) cvmx_write_csr(CVMX_GMXX_PRTX_CFG(index, interface), gmx_cfg.u64); } -/** +/* * Workaround ASX setup errata with CN38XX pass1 * * @interface: Interface to setup @@ -148,7 +148,7 @@ static int __cvmx_helper_errata_asx_pass1(int interface, int port, return 0; } -/** +/* * Configure all of the ASX, GMX, and PKO registers required * to get RGMII to function on the supplied interface. * @@ -251,7 +251,7 @@ int __cvmx_helper_rgmii_enable(int interface) return 0; } -/** +/* * Return the link state of an IPD/PKO port as returned by * auto negotiation. The result of this function may not match * Octeon's link config if auto negotiation has changed since @@ -280,7 +280,7 @@ union cvmx_helper_link_info __cvmx_helper_rgmii_link_get(int ipd_port) return __cvmx_helper_board_link_get(ipd_port); } -/** +/* * Configure an IPD/PKO port for the specified link state. This * function does not influence auto negotiation at the PHY level. * The passed link state must always match the link state returned diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c index 842990e8404f..fea71a85bb29 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-xaui.c @@ -54,7 +54,7 @@ int __cvmx_helper_xaui_enumerate(int interface) return 1; } -/** +/* * Probe a XAUI interface and determine the number of ports * connected to it. The XAUI interface should still be down * after this call. @@ -102,7 +102,7 @@ int __cvmx_helper_xaui_probe(int interface) return __cvmx_helper_xaui_enumerate(interface); } -/** +/* * Bringup and enable a XAUI interface. After this call packet * I/O should be fully functional. This is called with IPD * enabled but PKO disabled. @@ -249,7 +249,7 @@ int __cvmx_helper_xaui_enable(int interface) return 0; } -/** +/* * Return the link state of an IPD/PKO port as returned by * auto negotiation. The result of this function may not match * Octeon's link config if auto negotiation has changed since @@ -288,7 +288,7 @@ union cvmx_helper_link_info __cvmx_helper_xaui_link_get(int ipd_port) return result; } -/** +/* * Configure an IPD/PKO port for the specified link state. This * function does not influence auto negotiation at the PHY level. * The passed link state must always match the link state returned diff --git a/arch/mips/cavium-octeon/executive/cvmx-l2c.c b/arch/mips/cavium-octeon/executive/cvmx-l2c.c index 83df0a963a8b..33b303691bc2 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-l2c.c +++ b/arch/mips/cavium-octeon/executive/cvmx-l2c.c @@ -281,7 +281,7 @@ uint64_t cvmx_l2c_read_perf(uint32_t counter) } } -/** +/* * @INTERNAL * Helper function use to fault in cache lines for L2 cache locking * @@ -575,7 +575,7 @@ union __cvmx_l2c_tag { }; -/** +/* * @INTERNAL * Function to read a L2C tag. This code make the current core * the 'debug core' for the L2. This code must only be executed by @@ -764,9 +764,8 @@ int cvmx_l2c_get_cache_size_bytes(void) CVMX_CACHE_LINE_SIZE; } -/** +/* * Return log base 2 of the number of sets in the L2 cache - * Returns */ int cvmx_l2c_get_set_bits(void) { @@ -857,7 +856,7 @@ int cvmx_l2c_get_num_assoc(void) return l2_assoc; } -/** +/* * Flush a line from the L2 cache * This should only be called from one core at a time, as this routine * sets the core to the 'debug' core in order to flush the line. diff --git a/arch/mips/cavium-octeon/executive/cvmx-pko.c b/arch/mips/cavium-octeon/executive/cvmx-pko.c index b0efc35e95c4..7c4879e74318 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-pko.c +++ b/arch/mips/cavium-octeon/executive/cvmx-pko.c @@ -35,7 +35,7 @@ #include #include -/** +/* * Internal state of packet output */ @@ -176,7 +176,7 @@ static void __cvmx_pko_chip_init(void) } } -/** +/* * Call before any other calls to initialize the packet * output system. This does chip global config, and should only be * done by one core. @@ -229,7 +229,7 @@ void cvmx_pko_initialize_global(void) } } -/** +/* * This function does per-core initialization required by the PKO routines. * This must be called on all cores that will do packet output, and must * be called after the FPA has been initialized and filled with pages. @@ -243,7 +243,7 @@ int cvmx_pko_initialize_local(void) return 0; } -/** +/* * Enables the packet output hardware. It must already be * configured. */ @@ -266,7 +266,7 @@ void cvmx_pko_enable(void) cvmx_write_csr(CVMX_PKO_REG_FLAGS, flags.u64); } -/** +/* * Disables the packet output. Does not affect any configuration. */ void cvmx_pko_disable(void) @@ -278,7 +278,7 @@ void cvmx_pko_disable(void) } EXPORT_SYMBOL_GPL(cvmx_pko_disable); -/** +/* * Reset the packet output. */ static void __cvmx_pko_reset(void) @@ -289,7 +289,7 @@ static void __cvmx_pko_reset(void) cvmx_write_csr(CVMX_PKO_REG_FLAGS, pko_reg_flags.u64); } -/** +/* * Shutdown and free resources required by packet output. */ void cvmx_pko_shutdown(void) @@ -320,7 +320,7 @@ void cvmx_pko_shutdown(void) } EXPORT_SYMBOL_GPL(cvmx_pko_shutdown); -/** +/* * Configure a output port and the associated queues for use. * * @port: Port to configure. @@ -548,7 +548,7 @@ cvmx_pko_status_t cvmx_pko_config_port(uint64_t port, uint64_t base_queue, } #ifdef PKO_DEBUG -/** +/* * Show map of ports -> queues for different cores. */ void cvmx_pko_show_queue_map() @@ -573,7 +573,7 @@ void cvmx_pko_show_queue_map() } #endif -/** +/* * Rate limit a PKO port to a max packets/sec. This function is only * supported on CN51XX and higher, excluding CN58XX. * @@ -606,7 +606,7 @@ int cvmx_pko_rate_limit_packets(int port, int packets_s, int burst) return 0; } -/** +/* * Rate limit a PKO port to a max bits/sec. This function is only * supported on CN51XX and higher, excluding CN58XX. * diff --git a/arch/mips/cavium-octeon/executive/cvmx-spi.c b/arch/mips/cavium-octeon/executive/cvmx-spi.c index f51957a3e915..eb9333e84a6b 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-spi.c +++ b/arch/mips/cavium-octeon/executive/cvmx-spi.c @@ -66,7 +66,7 @@ static cvmx_spi_callbacks_t cvmx_spi_callbacks = { .interface_up_cb = cvmx_spi_interface_up_cb }; -/** +/* * Get current SPI4 initialization callbacks * * @callbacks: Pointer to the callbacks structure.to fill @@ -78,7 +78,7 @@ void cvmx_spi_get_callbacks(cvmx_spi_callbacks_t *callbacks) memcpy(callbacks, &cvmx_spi_callbacks, sizeof(cvmx_spi_callbacks)); } -/** +/* * Set new SPI4 initialization callbacks * * @new_callbacks: Pointer to an updated callbacks structure. @@ -88,7 +88,7 @@ void cvmx_spi_set_callbacks(cvmx_spi_callbacks_t *new_callbacks) memcpy(&cvmx_spi_callbacks, new_callbacks, sizeof(cvmx_spi_callbacks)); } -/** +/* * Initialize and start the SPI interface. * * @interface: The identifier of the packet interface to configure and @@ -133,7 +133,7 @@ int cvmx_spi_start_interface(int interface, cvmx_spi_mode_t mode, int timeout, return res; } -/** +/* * This routine restarts the SPI interface after it has lost synchronization * with its correspondent system. * @@ -179,7 +179,7 @@ int cvmx_spi_restart_interface(int interface, cvmx_spi_mode_t mode, int timeout) } EXPORT_SYMBOL_GPL(cvmx_spi_restart_interface); -/** +/* * Callback to perform SPI4 reset * * @interface: The identifier of the packet interface to configure and @@ -294,7 +294,7 @@ int cvmx_spi_reset_cb(int interface, cvmx_spi_mode_t mode) return 0; } -/** +/* * Callback to setup calendar and miscellaneous settings before clock detection * * @interface: The identifier of the packet interface to configure and @@ -413,7 +413,7 @@ int cvmx_spi_calendar_setup_cb(int interface, cvmx_spi_mode_t mode, return 0; } -/** +/* * Callback to perform clock detection * * @interface: The identifier of the packet interface to configure and @@ -491,7 +491,7 @@ int cvmx_spi_clock_detect_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to perform link training * * @interface: The identifier of the packet interface to configure and @@ -560,7 +560,7 @@ int cvmx_spi_training_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to perform calendar data synchronization * * @interface: The identifier of the packet interface to configure and @@ -617,7 +617,7 @@ int cvmx_spi_calendar_sync_cb(int interface, cvmx_spi_mode_t mode, int timeout) return 0; } -/** +/* * Callback to handle interface up * * @interface: The identifier of the packet interface to configure and From 64c888ce33602c66f1a043c1f7943e3d453186c2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jul 2021 09:02:57 -0700 Subject: [PATCH 055/474] mips: clean up kernel-doc in cavium-octeon/*.c Convert function comments to kernel-doc notation to remove kernel-doc warnings in arch/mips/cavium-octeon/*.c. Also clean up the comments in a few places. arch/mips/cavium-octeon/flash_setup.c:66: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Module/ driver initialization. -- arch/mips/cavium-octeon/setup.c:308: warning: expecting prototype for Return non zero if we are currently running in the Octeon simulator(). Prototype was for octeon_is_simulation() instead arch/mips/cavium-octeon/setup.c:314: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Return true if Octeon is in PCI Host mode. This means arch/mips/cavium-octeon/setup.c:334: warning: expecting prototype for Get the clock rate of Octeon(). Prototype was for octeon_get_clock_rate() instead arch/mips/cavium-octeon/setup.c:351: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Write to the LCD display connected to the bootbus. This display arch/mips/cavium-octeon/setup.c:380: warning: expecting prototype for Return the console uart passed by the bootloader(). Prototype was for octeon_get_boot_uart() instead arch/mips/cavium-octeon/setup.c:386: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Get the coremask Linux was booted on. arch/mips/cavium-octeon/setup.c:399: warning: expecting prototype for Check the hardware BIST results for a CPU(). Prototype was for octeon_check_cpu_bist() instead arch/mips/cavium-octeon/setup.c:432: warning: expecting prototype for Reboot Octeon(). Prototype was for octeon_restart() instead arch/mips/cavium-octeon/setup.c:452: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Permanently stop a core. arch/mips/cavium-octeon/setup.c:475: warning: expecting prototype for Halt the system(). Prototype was for octeon_halt() instead arch/mips/cavium-octeon/setup.c:520: warning: expecting prototype for Return a string representing the system type(). Prototype was for octeon_board_type_string() instead arch/mips/cavium-octeon/setup.c:661: warning: expecting prototype for Early entry point for arch setup(). Prototype was for prom_init() instead -- arch/mips/cavium-octeon/smp.c:100: warning: Function parameter or member 'cpu' not described in 'octeon_send_ipi_single' arch/mips/cavium-octeon/smp.c:100: warning: Function parameter or member 'action' not described in 'octeon_send_ipi_single' arch/mips/cavium-octeon/smp.c:100: warning: expecting prototype for Cause the function described by call_data to be executed on the passed(). Prototype was for octeon_send_ipi_single() instead arch/mips/cavium-octeon/smp.c:119: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Detect available CPUs, populate cpu_possible_mask arch/mips/cavium-octeon/smp.c:210: warning: Function parameter or member 'cpu' not described in 'octeon_boot_secondary' arch/mips/cavium-octeon/smp.c:210: warning: Function parameter or member 'idle' not described in 'octeon_boot_secondary' arch/mips/cavium-octeon/smp.c:210: warning: expecting prototype for Firmware CPU startup hook(). Prototype was for octeon_boot_secondary() instead arch/mips/cavium-octeon/smp.c:236: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * After we've done initial boot, this function is called to allow the arch/mips/cavium-octeon/smp.c:258: warning: Function parameter or member 'max_cpus' not described in 'octeon_prepare_cpus' arch/mips/cavium-octeon/smp.c:258: warning: expecting prototype for Callout to firmware before smp_init(). Prototype was for octeon_prepare_cpus() instead arch/mips/cavium-octeon/smp.c:276: warning: expecting prototype for Last chance for the board code to finish SMP initialization before(). Prototype was for octeon_smp_finish() instead Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Aditya Srivastava Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/flash_setup.c | 2 +- arch/mips/cavium-octeon/setup.c | 43 ++++++++++++++------------- arch/mips/cavium-octeon/smp.c | 14 ++++----- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/arch/mips/cavium-octeon/flash_setup.c b/arch/mips/cavium-octeon/flash_setup.c index a5e8f4a784af..c8a8c6d359b9 100644 --- a/arch/mips/cavium-octeon/flash_setup.c +++ b/arch/mips/cavium-octeon/flash_setup.c @@ -62,7 +62,7 @@ static void octeon_flash_map_copy_to(struct map_info *map, unsigned long to, up(&octeon_bootbus_sem); } -/** +/* * Module/ driver initialization. * * Returns Zero on success diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 0ddd3cc16ee4..00bf269763cf 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -295,9 +295,10 @@ static int octeon_uart; extern asmlinkage void handle_int(void); /** - * Return non zero if we are currently running in the Octeon simulator + * octeon_is_simulation - Return non-zero if we are currently running + * in the Octeon simulator * - * Returns + * Return: non-0 if running in the Octeon simulator, 0 otherwise */ int octeon_is_simulation(void) { @@ -306,10 +307,10 @@ int octeon_is_simulation(void) EXPORT_SYMBOL(octeon_is_simulation); /** - * Return true if Octeon is in PCI Host mode. This means + * octeon_is_pci_host - Return true if Octeon is in PCI Host mode. This means * Linux can control the PCI bus. * - * Returns Non zero if Octeon in host mode. + * Return: Non-zero if Octeon is in host mode. */ int octeon_is_pci_host(void) { @@ -321,9 +322,9 @@ int octeon_is_pci_host(void) } /** - * Get the clock rate of Octeon + * octeon_get_clock_rate - Get the clock rate of Octeon * - * Returns Clock rate in HZ + * Return: Clock rate in HZ */ uint64_t octeon_get_clock_rate(void) { @@ -343,11 +344,11 @@ EXPORT_SYMBOL(octeon_get_io_clock_rate); /** - * Write to the LCD display connected to the bootbus. This display - * exists on most Cavium evaluation boards. If it doesn't exist, then - * this function doesn't do anything. - * + * octeon_write_lcd - Write to the LCD display connected to the bootbus. * @s: String to write + * + * This display exists on most Cavium evaluation boards. If it doesn't exist, + * then this function doesn't do anything. */ static void octeon_write_lcd(const char *s) { @@ -367,9 +368,9 @@ static void octeon_write_lcd(const char *s) } /** - * Return the console uart passed by the bootloader + * octeon_get_boot_uart - Return the console uart passed by the bootloader * - * Returns uart (0 or 1) + * Return: uart number (0 or 1) */ static int octeon_get_boot_uart(void) { @@ -378,9 +379,9 @@ static int octeon_get_boot_uart(void) } /** - * Get the coremask Linux was booted on. + * octeon_get_boot_coremask - Get the coremask Linux was booted on. * - * Returns Core mask + * Return: Core mask */ int octeon_get_boot_coremask(void) { @@ -388,7 +389,7 @@ int octeon_get_boot_coremask(void) } /** - * Check the hardware BIST results for a CPU + * octeon_check_cpu_bist - Check the hardware BIST results for a CPU */ void octeon_check_cpu_bist(void) { @@ -419,7 +420,7 @@ void octeon_check_cpu_bist(void) } /** - * Reboot Octeon + * octeon_restart - Reboot Octeon * * @command: Command to pass to the bootloader. Currently ignored. */ @@ -444,7 +445,7 @@ static void octeon_restart(char *command) /** - * Permanently stop a core. + * octeon_kill_core - Permanently stop a core. * * @arg: Ignored. */ @@ -464,7 +465,7 @@ static void octeon_kill_core(void *arg) /** - * Halt the system + * octeon_halt - Halt the system */ static void octeon_halt(void) { @@ -507,9 +508,9 @@ static void __init init_octeon_system_type(void) } /** - * Return a string representing the system type + * octeon_board_type_string - Return a string representing the system type * - * Returns + * Return: system type string */ const char *octeon_board_type_string(void) { @@ -650,7 +651,7 @@ void octeon_user_io_init(void) } /** - * Early entry point for arch setup + * prom_init - Early entry point for arch setup */ void __init prom_init(void) { diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 66ce5527da54..89954f5f87fb 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -91,7 +91,7 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/** +/* * Cause the function described by call_data to be executed on the passed * cpu. When the function has finished, increment the finished field of * call_data. @@ -115,7 +115,7 @@ static inline void octeon_send_ipi_mask(const struct cpumask *mask, octeon_send_ipi_single(i, action); } -/** +/* * Detect available CPUs, populate cpu_possible_mask */ static void octeon_smp_hotplug_setup(void) @@ -202,9 +202,8 @@ int plat_post_relocation(long offset) } #endif /* CONFIG_RELOCATABLE */ -/** +/* * Firmware CPU startup hook - * */ static int octeon_boot_secondary(int cpu, struct task_struct *idle) { @@ -232,7 +231,7 @@ static int octeon_boot_secondary(int cpu, struct task_struct *idle) return 0; } -/** +/* * After we've done initial boot, this function is called to allow the * board code to clean up state, if needed */ @@ -250,9 +249,8 @@ static void octeon_init_secondary(void) octeon_irq_setup_secondary(); } -/** +/* * Callout to firmware before smp_init - * */ static void __init octeon_prepare_cpus(unsigned int max_cpus) { @@ -268,7 +266,7 @@ static void __init octeon_prepare_cpus(unsigned int max_cpus) } } -/** +/* * Last chance for the board code to finish SMP initialization before * the CPU is "online". */ From d2ac3a11cba23454516ba17f03e078a6831b8be5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jul 2021 09:02:58 -0700 Subject: [PATCH 056/474] mips: clean up kernel-doc in mm/c-octeon.c Clean up kernel-doc warnings in arch/mips/mm/c-octeon.c. arch/mips/mm/c-octeon.c:34: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Octeon automatically flushes the dcache on tlb changes, so arch/mips/mm/c-octeon.c:65: warning: expecting prototype for Flush caches as necessary for all cores affected by a(). Prototype was for octeon_flush_icache_all_cores() instead arch/mips/mm/c-octeon.c:99: warning: expecting prototype for Called to flush the icache on all cores(). Prototype was for octeon_flush_icache_all() instead arch/mips/mm/c-octeon.c:111: warning: expecting prototype for Called to flush all memory associated with a memory(). Prototype was for octeon_flush_cache_mm() instead arch/mips/mm/c-octeon.c:124: warning: Function parameter or member 'start' not described in 'octeon_flush_icache_range' arch/mips/mm/c-octeon.c:124: warning: Function parameter or member 'end' not described in 'octeon_flush_icache_range' arch/mips/mm/c-octeon.c:124: warning: expecting prototype for Flush a range of kernel addresses out of the icache(). Prototype was for octeon_flush_icache_range() instead arch/mips/mm/c-octeon.c:138: warning: expecting prototype for Flush a range out of a vma(). Prototype was for octeon_flush_cache_range() instead arch/mips/mm/c-octeon.c:153: warning: expecting prototype for Flush a specific page of a vma(). Prototype was for octeon_flush_cache_page() instead arch/mips/mm/c-octeon.c:164: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Probe Octeon's caches arch/mips/mm/c-octeon.c:264: warning: expecting prototype for Setup the Octeon cache flush routines(). Prototype was for octeon_cache_init() instead arch/mips/mm/c-octeon.c:349: warning: expecting prototype for Called when the the exception is not recoverable(). Prototype was for cache_parity_error_octeon_non_recoverable() instead Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Aditya Srivastava Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/c-octeon.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/arch/mips/mm/c-octeon.c b/arch/mips/mm/c-octeon.c index 8ae181e08311..ec2ae501539a 100644 --- a/arch/mips/mm/c-octeon.c +++ b/arch/mips/mm/c-octeon.c @@ -30,7 +30,7 @@ unsigned long long cache_err_dcache[NR_CPUS]; EXPORT_SYMBOL_GPL(cache_err_dcache); -/** +/* * Octeon automatically flushes the dcache on tlb changes, so * from Linux's viewpoint it acts much like a physically * tagged cache. No flushing is needed @@ -56,8 +56,8 @@ static void local_octeon_flush_icache_range(unsigned long start, } /** - * Flush caches as necessary for all cores affected by a - * vma. If no vma is supplied, all cores are flushed. + * octeon_flush_icache_all_cores - Flush caches as necessary for all cores + * affected by a vma. If no vma is supplied, all cores are flushed. * * @vma: VMA to flush or NULL to flush all icaches. */ @@ -92,7 +92,7 @@ static void octeon_flush_icache_all_cores(struct vm_area_struct *vma) } -/** +/* * Called to flush the icache on all cores */ static void octeon_flush_icache_all(void) @@ -102,8 +102,7 @@ static void octeon_flush_icache_all(void) /** - * Called to flush all memory associated with a memory - * context. + * octeon_flush_cache_mm - flush all memory associated with a memory context. * * @mm: Memory context to flush */ @@ -116,7 +115,7 @@ static void octeon_flush_cache_mm(struct mm_struct *mm) } -/** +/* * Flush a range of kernel addresses out of the icache * */ @@ -127,11 +126,11 @@ static void octeon_flush_icache_range(unsigned long start, unsigned long end) /** - * Flush a range out of a vma + * octeon_flush_cache_range - Flush a range out of a vma * * @vma: VMA to flush - * @start: - * @end: + * @start: beginning address for flush + * @end: ending address for flush */ static void octeon_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -142,11 +141,11 @@ static void octeon_flush_cache_range(struct vm_area_struct *vma, /** - * Flush a specific page of a vma + * octeon_flush_cache_page - Flush a specific page of a vma * * @vma: VMA to flush page for * @page: Page to flush - * @pfn: + * @pfn: Page frame number */ static void octeon_flush_cache_page(struct vm_area_struct *vma, unsigned long page, unsigned long pfn) @@ -160,7 +159,7 @@ static void octeon_flush_kernel_vmap_range(unsigned long vaddr, int size) BUG(); } -/** +/* * Probe Octeon's caches * */ @@ -256,7 +255,7 @@ static void octeon_cache_error_setup(void) set_handler(0x100, &except_vec2_octeon, 0x80); } -/** +/* * Setup the Octeon cache flush routines * */ @@ -341,7 +340,7 @@ asmlinkage void cache_parity_error_octeon_recoverable(void) co_cache_error_call_notifiers(0); } -/** +/* * Called when the the exception is not recoverable */ From a86aadeff2fe9203d9575f1c157d09b3c557d1ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Jul 2021 11:26:12 +0100 Subject: [PATCH 057/474] MIPS: Alchemy: Fix spelling contraction "cant" -> "can't" There is a spelling mistake in a pr_warn message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Thomas Bogendoerfer --- arch/mips/alchemy/devboards/db1200.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c index 421d651433b6..1864eb935ca5 100644 --- a/arch/mips/alchemy/devboards/db1200.c +++ b/arch/mips/alchemy/devboards/db1200.c @@ -835,7 +835,7 @@ int __init db1200_dev_setup(void) if (!IS_ERR(c)) { pfc = clk_round_rate(c, 50000000); if ((pfc < 1) || (abs(50000000 - pfc) > 2500000)) - pr_warn("DB1200: cant get I2C close to 50MHz\n"); + pr_warn("DB1200: can't get I2C close to 50MHz\n"); else clk_set_rate(c, pfc); clk_prepare_enable(c); From a449ffaf9181b5a2dc705d8a06b13e0068207fd4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Jul 2021 12:42:31 +0100 Subject: [PATCH 058/474] powerpc/svm: Don't issue ultracalls if !mem_encrypt_active() Commit ad6c00283163 ("swiotlb: Free tbl memory in swiotlb_exit()") introduced a set_memory_encrypted() call to swiotlb_exit() so that the buffer pages are returned to an encrypted state prior to being freed. Sachin reports that this leads to the following crash on a Power server: [ 0.010799] software IO TLB: tearing down default memory pool [ 0.010805] ------------[ cut here ]------------ [ 0.010808] kernel BUG at arch/powerpc/kernel/interrupt.c:98! Nick spotted that this is because set_memory_encrypted() is issuing an ultracall which doesn't exist for the processor, and should therefore be gated by mem_encrypt_active() to mirror the x86 implementation. Cc: Konrad Rzeszutek Wilk Cc: Claire Chang Cc: Christoph Hellwig Cc: Robin Murphy Fixes: ad6c00283163 ("swiotlb: Free tbl memory in swiotlb_exit()") Suggested-by: Nicholas Piggin Reported-by: Sachin Sant Tested-by: Sachin Sant Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/1905CD70-7656-42AE-99E2-A31FC3812EAC@linux.vnet.ibm.com/ Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- arch/powerpc/platforms/pseries/svm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 1d829e257996..87f001b4c4e4 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -63,6 +63,9 @@ void __init svm_swiotlb_init(void) int set_memory_encrypted(unsigned long addr, int numpages) { + if (!mem_encrypt_active()) + return 0; + if (!PAGE_ALIGNED(addr)) return -EINVAL; @@ -73,6 +76,9 @@ int set_memory_encrypted(unsigned long addr, int numpages) int set_memory_decrypted(unsigned long addr, int numpages) { + if (!mem_encrypt_active()) + return 0; + if (!PAGE_ALIGNED(addr)) return -EINVAL; From fc65d0acaf23179b94de399c204328fa259acb90 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:03 -0700 Subject: [PATCH 059/474] iommu/amd: Selective flush on unmap Recent patch attempted to enable selective page flushes on AMD IOMMU but neglected to adapt amd_iommu_iotlb_sync() to use the selective flushes. Adapt amd_iommu_iotlb_sync() to use selective flushes and change amd_iommu_unmap() to collect the flushes. As a defensive measure, to avoid potential issues as those that the Intel IOMMU driver encountered recently, flush the page-walk caches by always setting the "pde" parameter. This can be removed later. Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-2-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a7d6d78147b7..5e4b0ee98f64 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2060,12 +2060,17 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, { struct protection_domain *domain = to_pdomain(dom); struct io_pgtable_ops *ops = &domain->iop.iop.ops; + size_t r; if ((amd_iommu_pgtable == AMD_IOMMU_V1) && (domain->iop.mode == PAGE_MODE_NONE)) return 0; - return (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0; + r = (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0; + + iommu_iotlb_gather_add_page(dom, gather, iova, page_size); + + return r; } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2168,7 +2173,13 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) static void amd_iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - amd_iommu_flush_iotlb_all(domain); + struct protection_domain *dom = to_pdomain(domain); + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); + __domain_flush_pages(dom, gather->start, gather->end - gather->start, 1); + amd_iommu_domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); } static int amd_iommu_def_domain_type(struct device *dev) From 6664340cf1d541c2a44d588002893f795ab4143f Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:04 -0700 Subject: [PATCH 060/474] iommu/amd: Do not use flush-queue when NpCache is on Do not use flush-queue on virtualized environments, where the NpCache capability of the IOMMU is set. This is required to reduce virtualization overheads. This change follows a similar change to Intel's VT-d and a detailed explanation as for the rationale is described in commit 29b32839725f ("iommu/vt-d: Do not use flush-queue when caching-mode is on"). Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-3-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/init.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 46280e6e1535..1c7ae7d3c55d 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1850,8 +1850,13 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) if (ret) return ret; - if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) { + if (!amd_iommu_unmap_flush) + pr_info("IOMMU batching is disabled due to virtualization\n"); + amd_iommu_np_cache = true; + amd_iommu_unmap_flush = true; + } init_iommu_perf_ctr(iommu); From 3136895cc5b665c1ab406d78f90c0700a3551e74 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 23 Jul 2021 02:32:05 -0700 Subject: [PATCH 061/474] iommu: Improve iommu_iotlb_gather helpers The Mediatek driver is not the only one which might want a basic address-based gathering behaviour, so although it's arguably simple enough to open-code, let's factor it out for the sake of cleanliness. Let's also take this opportunity to document the intent of these helpers for clarity. Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Robin Murphy Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-4-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 6 +----- include/linux/iommu.h | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 6f7c69688ce2..d9939e4af35c 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -520,12 +520,8 @@ static size_t mtk_iommu_unmap(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { struct mtk_iommu_domain *dom = to_mtk_domain(domain); - unsigned long end = iova + size - 1; - if (gather->start > iova) - gather->start = iova; - if (gather->end < end) - gather->end = end; + iommu_iotlb_gather_add_range(gather, iova, size); return dom->iop->unmap(dom->iop, iova, size, gather); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 32d448050bf7..e554871db46f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,38 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build arbitrarily-sized invalidation commands + * where only the address range matters, and simply minimising intermediate + * syncs is preferred. + */ +static inline void iommu_iotlb_gather_add_range(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long end = iova + size - 1; + + if (gather->start > iova) + gather->start = iova; + if (gather->end < end) + gather->end = end; +} + +/** + * iommu_iotlb_gather_add_page - Gather for page-based TLB invalidation + * @domain: IOMMU domain to be invalidated + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to build invalidation commands based on individual + * pages, or with page size/table level hints which cannot be gathered if they + * differ. + */ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) @@ -515,11 +547,7 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->pgsize = size; } - if (gather->end < end) - gather->end = end; - - if (gather->start > start) - gather->start = start; + iommu_iotlb_gather_add_range(gather, iova, size); } /* PCI device grouping function */ From febb82c208e481eee057c70fa3176bb48712a111 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:06 -0700 Subject: [PATCH 062/474] iommu: Factor iommu_iotlb_gather_is_disjoint() out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor iommu_iotlb_gather_add_page() and factor out the logic that detects whether IOTLB gather range and a new range are disjoint. To be used by the next patch that implements different gathering logic for AMD. Note that updating gather->pgsize unconditionally does not affect correctness as the function had (and has) an invariant, in which gather->pgsize always represents the flushing granularity of its range. Arguably, “size" should never be zero, but lets assume for the matter of discussion that it might. If "size" equals to "gather->pgsize", then the assignment in question has no impact. Otherwise, if "size" is non-zero, then iommu_iotlb_sync() would initialize the size and range (see iommu_iotlb_gather_init()), and the invariant is kept. Otherwise, "size" is zero, and "gather" already holds a range, so gather->pgsize is non-zero and (gather->pgsize && gather->pgsize != size) is true. Therefore, again, iommu_iotlb_sync() would be called and initialize the size. Cc: Joerg Roedel Cc: Jiajun Cao Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org> Reviewed-by: Robin Murphy Acked-by: Will Deacon Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-5-namit@vmware.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e554871db46f..979a5ceeea55 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -497,6 +497,28 @@ static inline void iommu_iotlb_sync(struct iommu_domain *domain, iommu_iotlb_gather_init(iotlb_gather); } +/** + * iommu_iotlb_gather_is_disjoint - Checks whether a new range is disjoint + * + * @gather: TLB gather data + * @iova: start of page to invalidate + * @size: size of page to invalidate + * + * Helper for IOMMU drivers to check whether a new range and the gathered range + * are disjoint. For many IOMMUs, flushing the IOMMU in this case is better + * than merging the two, which might lead to unnecessary invalidations. + */ +static inline +bool iommu_iotlb_gather_is_disjoint(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + unsigned long start = iova, end = start + size - 1; + + return gather->end != 0 && + (end + 1 < gather->start || start > gather->end + 1); +} + + /** * iommu_iotlb_gather_add_range - Gather for address-based TLB invalidation * @gather: TLB gather data @@ -533,20 +555,16 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size) { - unsigned long start = iova, end = start + size - 1; - /* * If the new page is disjoint from the current range or is mapped at * a different granularity, then sync the TLB so that the gather * structure can be rewritten. */ - if (gather->pgsize != size || - end + 1 < gather->start || start > gather->end + 1) { - if (gather->pgsize) - iommu_iotlb_sync(domain, gather); - gather->pgsize = size; - } + if ((gather->pgsize && gather->pgsize != size) || + iommu_iotlb_gather_is_disjoint(gather, iova, size)) + iommu_iotlb_sync(domain, gather); + gather->pgsize = size; iommu_iotlb_gather_add_range(gather, iova, size); } From fe6d269d0e9b05fa5233f810a3cb7cce4077b261 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:07 -0700 Subject: [PATCH 063/474] iommu/amd: Tailored gather logic for AMD AMD's IOMMU can flush efficiently (i.e., in a single flush) any range. This is in contrast, for instnace, to Intel IOMMUs that have a limit on the number of pages that can be flushed in a single flush. In addition, AMD's IOMMU do not care about the page-size, so changes of the page size do not need to trigger a TLB flush. So in most cases, a TLB flush due to disjoint range is not needed for AMD. Yet, vIOMMUs require the hypervisor to synchronize the virtualized IOMMU's PTEs with the physical ones. This process induce overheads, so it is better not to cause unnecessary flushes, i.e., flushes of PTEs that were not modified. Implement and use amd_iommu_iotlb_gather_add_page() and use it instead of the generic iommu_iotlb_gather_add_page(). Ignore disjoint regions unless "non-present cache" feature is reported by the IOMMU capabilities, as this is an indication we are running on a physical IOMMU. A similar indication is used by VT-d (see "caching mode"). The new logic retains the same flushing behavior that we had before the introduction of page-selective IOTLB flushes for AMD. On virtualized environments, check if the newly flushed region and the gathered one are disjoint and flush if it is. Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org> Reviewed-by: Robin Murphy Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-6-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 5e4b0ee98f64..0957be3b6274 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2054,6 +2054,27 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, return ret; } +static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather, + unsigned long iova, size_t size) +{ + /* + * AMD's IOMMU can flush as many pages as necessary in a single flush. + * Unless we run in a virtual machine, which can be inferred according + * to whether "non-present cache" is on, it is probably best to prefer + * (potentially) too extensive TLB flushing (i.e., more misses) over + * mutliple TLB flushes (i.e., more flushes). For virtual machines the + * hypervisor needs to synchronize the host IOMMU PTEs with those of + * the guest, and the trade-off is different: unnecessary TLB flushes + * should be avoided. + */ + if (amd_iommu_np_cache && + iommu_iotlb_gather_is_disjoint(gather, iova, size)) + iommu_iotlb_sync(domain, gather); + + iommu_iotlb_gather_add_range(gather, iova, size); +} + static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, size_t page_size, struct iommu_iotlb_gather *gather) @@ -2068,7 +2089,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, r = (ops->unmap) ? ops->unmap(ops, iova, page_size, gather) : 0; - iommu_iotlb_gather_add_page(dom, gather, iova, page_size); + amd_iommu_iotlb_gather_add_page(dom, gather, iova, page_size); return r; } From 3b122a5666cb7c0bb9a439fba0c9a6cf59f999c3 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:08 -0700 Subject: [PATCH 064/474] iommu/amd: Sync once for scatter-gather operations On virtual machines, software must flush the IOTLB after each page table entry update. The iommu_map_sg() code iterates through the given scatter-gather list and invokes iommu_map() for each element in the scatter-gather list, which calls into the vendor IOMMU driver through iommu_ops callback. As the result, a single sg mapping may lead to multiple IOTLB flushes. Fix this by adding amd_iotlb_sync_map() callback and flushing at this point after all sg mappings we set. This commit is followed and inspired by commit 933fcd01e97e2 ("iommu/vt-d: Add iotlb_sync_map callback"). Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Robin Murphy Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-7-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 0957be3b6274..cee91cdf0016 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2028,6 +2028,16 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } +static void amd_iommu_iotlb_sync_map(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + struct protection_domain *domain = to_pdomain(dom); + struct io_pgtable_ops *ops = &domain->iop.iop.ops; + + if (ops->map) + domain_flush_np_cache(domain, iova, size); +} + static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, phys_addr_t paddr, size_t page_size, int iommu_prot, gfp_t gfp) @@ -2046,10 +2056,8 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - if (ops->map) { + if (ops->map) ret = ops->map(ops, iova, paddr, page_size, prot, gfp); - domain_flush_np_cache(domain, iova, page_size); - } return ret; } @@ -2229,6 +2237,7 @@ const struct iommu_ops amd_iommu_ops = { .attach_dev = amd_iommu_attach_device, .detach_dev = amd_iommu_detach_device, .map = amd_iommu_map, + .iotlb_sync_map = amd_iommu_iotlb_sync_map, .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .probe_device = amd_iommu_probe_device, From a270be1b3fdfb6940dd692c859fdf9a7407047be Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 23 Jul 2021 02:32:09 -0700 Subject: [PATCH 065/474] iommu/amd: Use only natural aligned flushes in a VM When running on an AMD vIOMMU, it is better to avoid TLB flushes of unmodified PTEs. vIOMMUs require the hypervisor to synchronize the virtualized IOMMU's PTEs with the physical ones. This process induce overheads. AMD IOMMU allows us to flush any range that is aligned to the power of 2. So when running on top of a vIOMMU, break the range into sub-ranges that are naturally aligned, and flush each one separately. This apporach is better when running with a vIOMMU, but on physical IOMMUs, the penalty of IOTLB misses due to unnecessary flushed entries is likely to be low. Repurpose (i.e., keeping the name, changing the logic) domain_flush_pages() so it is used to choose whether to perform one flush of the whole range or multiple ones to avoid flushing unnecessary ranges. Use NpCache, as usual, to infer whether the IOMMU is physical or virtual. Cc: Joerg Roedel Cc: Will Deacon Cc: Jiajun Cao Cc: Lu Baolu Cc: iommu@lists.linux-foundation.org Cc: linux-kernel@vger.kernel.org Suggested-by: Robin Murphy Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210723093209.714328-8-namit@vmware.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 47 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index cee91cdf0016..fb5c40715d10 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1267,15 +1267,52 @@ static void __domain_flush_pages(struct protection_domain *domain, } static void domain_flush_pages(struct protection_domain *domain, - u64 address, size_t size) + u64 address, size_t size, int pde) { - __domain_flush_pages(domain, address, size, 0); + if (likely(!amd_iommu_np_cache)) { + __domain_flush_pages(domain, address, size, pde); + return; + } + + /* + * When NpCache is on, we infer that we run in a VM and use a vIOMMU. + * In such setups it is best to avoid flushes of ranges which are not + * naturally aligned, since it would lead to flushes of unmodified + * PTEs. Such flushes would require the hypervisor to do more work than + * necessary. Therefore, perform repeated flushes of aligned ranges + * until you cover the range. Each iteration flushes the smaller + * between the natural alignment of the address that we flush and the + * greatest naturally aligned region that fits in the range. + */ + while (size != 0) { + int addr_alignment = __ffs(address); + int size_alignment = __fls(size); + int min_alignment; + size_t flush_size; + + /* + * size is always non-zero, but address might be zero, causing + * addr_alignment to be negative. As the casting of the + * argument in __ffs(address) to long might trim the high bits + * of the address on x86-32, cast to long when doing the check. + */ + if (likely((unsigned long)address != 0)) + min_alignment = min(addr_alignment, size_alignment); + else + min_alignment = size_alignment; + + flush_size = 1ul << min_alignment; + + __domain_flush_pages(domain, address, flush_size, pde); + address += flush_size; + size -= flush_size; + } } /* Flush the whole IO/TLB for a given protection domain - including PDE */ void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) { - __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); + domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } void amd_iommu_domain_flush_complete(struct protection_domain *domain) @@ -1302,7 +1339,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - domain_flush_pages(domain, iova, size); + domain_flush_pages(domain, iova, size, 1); amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2206,7 +2243,7 @@ static void amd_iommu_iotlb_sync(struct iommu_domain *domain, unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - __domain_flush_pages(dom, gather->start, gather->end - gather->start, 1); + domain_flush_pages(dom, gather->start, gather->end - gather->start, 1); amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } From 5c08c5acdc6ce46296a0f98d06896c818b8b34e0 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 22 Jun 2021 00:36:34 +0800 Subject: [PATCH 066/474] iommu/arm-smmu-v3: Remove some unneeded init in arm_smmu_cmdq_issue_cmdlist() Members of struct "llq" will be zero-inited, apart from member max_n_shift. But we write llq.val straight after the init, so it was pointless to zero init those other members. As such, separately init member max_n_shift only. In addition, struct "head" is initialised to "llq" only so that member max_n_shift is set. But that member is never referenced for "head", so remove any init there. Removing these initializations is seen as a small performance optimisation, as this code is (very) hot path. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1624293394-202509-1-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 235f9bdaeaf2..952291840c76 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -733,11 +733,11 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, unsigned long flags; bool owner; struct arm_smmu_cmdq *cmdq = &smmu->cmdq; - struct arm_smmu_ll_queue llq = { - .max_n_shift = cmdq->q.llq.max_n_shift, - }, head = llq; + struct arm_smmu_ll_queue llq, head; int ret = 0; + llq.max_n_shift = cmdq->q.llq.max_n_shift; + /* 1. Allocate some space in the queue */ local_irq_save(flags); llq.val = READ_ONCE(cmdq->q.llq.val); From 59103c79f46ab49453270ec9165cfdb38422088f Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Sat, 31 Jul 2021 10:17:10 +0800 Subject: [PATCH 067/474] iommu/arm-smmu-v3: Implement the unmap_pages() IOMMU driver callback Implement the unmap_pages() callback for ARM SMMUV3 driver to allow calls from iommu_unmap to unmap multiple pages of the same size in one call. Also remove the unmap() callback for the ARM SMMUV3 driver as it will no longer be used. Signed-off-by: Xiang Chen Acked-by: Will Deacon Link: https://lore.kernel.org/r/1627697831-158822-2-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6346f21726f4..2060e6d019a3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2450,8 +2450,9 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ops->map(ops, iova, paddr, size, prot, gfp); } -static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size, struct iommu_iotlb_gather *gather) +static size_t arm_smmu_unmap_pages(struct iommu_domain *domain, unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; @@ -2459,7 +2460,7 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, if (!ops) return 0; - return ops->unmap(ops, iova, size, gather); + return ops->unmap_pages(ops, iova, pgsize, pgcount, gather); } static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) @@ -2823,7 +2824,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_free = arm_smmu_domain_free, .attach_dev = arm_smmu_attach_dev, .map = arm_smmu_map, - .unmap = arm_smmu_unmap, + .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, From 9eec3f9b9e243b554b5804bbdb58ea61068adbda Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Sat, 31 Jul 2021 10:17:11 +0800 Subject: [PATCH 068/474] iommu/arm-smmu-v3: Implement the map_pages() IOMMU driver callback Implement the map_pages() callback for ARM SMMUV3 driver to allow calls from iommu_map to map multiple pages of the same size in one call. Also remove the map() callback for the ARM SMMUV3 driver as it will no longer be used. Signed-off-by: Xiang Chen Acked-by: Will Deacon Link: https://lore.kernel.org/r/1627697831-158822-3-git-send-email-chenxiang66@hisilicon.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2060e6d019a3..35d54919c583 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2439,15 +2439,16 @@ out_unlock: return ret; } -static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; if (!ops) return -ENODEV; - return ops->map(ops, iova, paddr, size, prot, gfp); + return ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp, mapped); } static size_t arm_smmu_unmap_pages(struct iommu_domain *domain, unsigned long iova, @@ -2823,7 +2824,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_alloc = arm_smmu_domain_alloc, .domain_free = arm_smmu_domain_free, .attach_dev = arm_smmu_attach_dev, - .map = arm_smmu_map, + .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, From 47a70bea54b7afa983c71fa409785c067f4e865b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 2 Aug 2021 17:06:43 +0200 Subject: [PATCH 069/474] iommu/amd: Remove stale amd_iommu_unmap_flush usage Remove the new use of the variable introduced in the AMD driver branch. The variable was removed already in the iommu core branch, causing build errors when the brances are merged. Cc: Nadav Amit Cc: Zhen Lei Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20210802150643.3634-1-joro@8bytes.org --- drivers/iommu/amd/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 239556c1f698..bdcf167b4afe 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1850,11 +1850,9 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) return ret; if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) { - if (!amd_iommu_unmap_flush) - pr_info("IOMMU batching is disabled due to virtualization\n"); - + pr_info("Using strict mode due to virtualization\n"); + iommu_set_dma_strict(); amd_iommu_np_cache = true; - amd_iommu_unmap_flush = true; } init_iommu_perf_ctr(iommu); From 50741b70b0cbbafbd9199f5180e66c0c53783a4a Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Mon, 19 Jul 2021 12:03:18 +0530 Subject: [PATCH 070/474] cpuidle: pseries: Fixup CEDE0 latency only for POWER10 onwards Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)") sets the exit latency of CEDE(0) based on the latency values of the Extended CEDE states advertised by the platform On POWER9 LPARs, the firmwares advertise a very low value of 2us for CEDE1 exit latency on a Dedicated LPAR. The latency advertized by the PHYP hypervisor corresponds to the latency required to wakeup from the underlying hardware idle state. However the wakeup latency from the LPAR perspective should include 1. The time taken to transition the CPU from the Hypervisor into the LPAR post wakeup from platform idle state 2. Time taken to send the IPI from the source CPU (waker) to the idle target CPU (wakee). 1. can be measured via timer idle test, where we queue a timer, say for 1ms, and enter the CEDE state. When the timer fires, in the timer handler we compute how much extra timer over the expected 1ms have we consumed. On a a POWER9 LPAR the numbers are CEDE latency measured using a timer (numbers in ns) N Min Median Avg 90%ile 99%ile Max Stddev 400 2601 5677 5668.74 5917 6413 9299 455.01 1. and 2. combined can be determined by an IPI latency test where we send an IPI to an idle CPU and in the handler compute the time difference between when the IPI was sent and when the handler ran. We see the following numbers on POWER9 LPAR. CEDE latency measured using an IPI (numbers in ns) N Min Median Avg 90%ile 99%ile Max Stddev 400 711 7564 7369.43 8559 9514 9698 1200.01 Suppose, we consider the 99th percentile latency value measured using the IPI to be the wakeup latency, the value would be 9.5us This is in the ballpark of the default value of 10us. Hence, use the exit latency of CEDE(0) based on the latency values advertized by platform only from POWER10 onwards. The values advertized on POWER10 platforms is more realistic and informed by the latency measurements. For earlier platforms stick to the default value of 10us. The fix was suggested by Michael Ellerman. Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)") Reported-by: Enrico Joedecke Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1626676399-15975-2-git-send-email-ego@linux.vnet.ibm.com --- drivers/cpuidle/cpuidle-pseries.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index a2b5c6f60cf0..e592280d8acf 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -419,7 +419,21 @@ static int pseries_idle_probe(void) cpuidle_state_table = shared_states; max_idle_state = ARRAY_SIZE(shared_states); } else { - fixup_cede0_latency(); + /* + * Use firmware provided latency values + * starting with POWER10 platforms. In the + * case that we are running on a POWER10 + * platform but in an earlier compat mode, we + * can still use the firmware provided values. + * + * However, on platforms prior to POWER10, we + * cannot rely on the accuracy of the firmware + * provided latency values. On such platforms, + * go with the conservative default estimate + * of 10us. + */ + if (cpu_has_feature(CPU_FTR_ARCH_31) || pvr_version_is(PVR_POWER10)) + fixup_cede0_latency(); cpuidle_state_table = dedicated_states; max_idle_state = NR_DEDICATED_STATES; } From 71737a6c2a8f801622d2b71567d1ec1e4c5b40b8 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Mon, 19 Jul 2021 12:03:19 +0530 Subject: [PATCH 071/474] cpuidle: pseries: Do not cap the CEDE0 latency in fixup_cede0_latency() Currently in fixup_cede0_latency() code, we perform the fixup the CEDE(0) exit latency value only if minimum advertized extended CEDE latency values are less than 10us. This was done so as to not break the expected behaviour on POWER8 platforms where the advertised latency was higher than the default 10us, which would delay the SMT folding on the core. However, after the earlier patch "cpuidle/pseries: Fixup CEDE0 latency only for POWER10 onwards", we can be sure that the fixup of CEDE0 latency is going to happen only from POWER10 onwards. Hence unconditionally use the minimum exit latency provided by the platform. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1626676399-15975-3-git-send-email-ego@linux.vnet.ibm.com --- drivers/cpuidle/cpuidle-pseries.c | 61 ++++++++++++++++--------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index e592280d8acf..bba449b77641 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -346,11 +346,9 @@ static int pseries_cpuidle_driver_init(void) static void __init fixup_cede0_latency(void) { struct xcede_latency_payload *payload; - u64 min_latency_us; + u64 min_xcede_latency_us = UINT_MAX; int i; - min_latency_us = dedicated_states[1].exit_latency; // CEDE latency - if (parse_cede_parameters()) return; @@ -358,42 +356,45 @@ static void __init fixup_cede0_latency(void) nr_xcede_records); payload = &xcede_latency_parameter.payload; + + /* + * The CEDE idle state maps to CEDE(0). While the hypervisor + * does not advertise CEDE(0) exit latency values, it does + * advertise the latency values of the extended CEDE states. + * We use the lowest advertised exit latency value as a proxy + * for the exit latency of CEDE(0). + */ for (i = 0; i < nr_xcede_records; i++) { struct xcede_latency_record *record = &payload->records[i]; + u8 hint = record->hint; u64 latency_tb = be64_to_cpu(record->latency_ticks); u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC); - if (latency_us == 0) - pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i); + /* + * We expect the exit latency of an extended CEDE + * state to be non-zero, it to since it takes at least + * a few nanoseconds to wakeup the idle CPU and + * dispatch the virtual processor into the Linux + * Guest. + * + * So we consider only non-zero value for performing + * the fixup of CEDE(0) latency. + */ + if (latency_us == 0) { + pr_warn("cpuidle: Skipping xcede record %d [hint=%d]. Exit latency = 0us\n", + i, hint); + continue; + } - if (latency_us < min_latency_us) - min_latency_us = latency_us; + if (latency_us < min_xcede_latency_us) + min_xcede_latency_us = latency_us; } - /* - * By default, we assume that CEDE(0) has exit latency 10us, - * since there is no way for us to query from the platform. - * - * However, if the wakeup latency of an Extended CEDE state is - * smaller than 10us, then we can be sure that CEDE(0) - * requires no more than that. - * - * Perform the fix-up. - */ - if (min_latency_us < dedicated_states[1].exit_latency) { - /* - * We set a minimum of 1us wakeup latency for cede0 to - * distinguish it from snooze - */ - u64 cede0_latency = 1; - - if (min_latency_us > cede0_latency) - cede0_latency = min_latency_us - 1; - - dedicated_states[1].exit_latency = cede0_latency; - dedicated_states[1].target_residency = 10 * (cede0_latency); + if (min_xcede_latency_us != UINT_MAX) { + dedicated_states[1].exit_latency = min_xcede_latency_us; + dedicated_states[1].target_residency = 10 * (min_xcede_latency_us); pr_info("cpuidle: Fixed up CEDE exit latency to %llu us\n", - cede0_latency); + min_xcede_latency_us); } } From a6cae77f1bc89368a4e2822afcddc45c3062d499 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Thu, 29 Jul 2021 20:01:03 +0200 Subject: [PATCH 072/474] powerpc/stacktrace: Include linux/delay.h commit 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") introduces udelay() call without including the linux/delay.h header. This may happen to work on master but the header that declares the functionshould be included nonetheless. Fixes: 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729180103.15578-1-msuchanek@suse.de --- arch/powerpc/kernel/stacktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 2b0d04a1b7d2..9e4a4a7af380 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -8,6 +8,7 @@ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp. */ +#include #include #include #include From d04691d373e75c83424b85c0e68e4a3f9370c10d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 3 Aug 2021 14:15:47 -0700 Subject: [PATCH 073/474] cpuidle: pseries: Mark pseries_idle_proble() as __init After commit 7cbd631d4dec ("cpuidle: pseries: Fixup CEDE0 latency only for POWER10 onwards"), pseries_idle_probe() is no longer inlined when compiling with clang, which causes a modpost warning: WARNING: modpost: vmlinux.o(.text+0xc86a54): Section mismatch in reference from the function pseries_idle_probe() to the function .init.text:fixup_cede0_latency() The function pseries_idle_probe() references the function __init fixup_cede0_latency(). This is often because pseries_idle_probe lacks a __init annotation or the annotation of fixup_cede0_latency is wrong. pseries_idle_probe() is a non-init function, which calls fixup_cede0_latency(), which is an init function, explaining the mismatch. pseries_idle_probe() is only called from pseries_processor_idle_init(), which is an init function, so mark pseries_idle_probe() as __init so there is no more warning. Fixes: 054e44ba99ae ("cpuidle: pseries: Add function to parse extended CEDE records") Signed-off-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210803211547.1093820-1-nathan@kernel.org --- drivers/cpuidle/cpuidle-pseries.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index bba449b77641..7e7ab5597d7a 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -403,7 +403,7 @@ static void __init fixup_cede0_latency(void) * pseries_idle_probe() * Choose state table for shared versus dedicated partition */ -static int pseries_idle_probe(void) +static int __init pseries_idle_probe(void) { if (cpuidle_disable != IDLE_NO_OVERRIDE) From 156ca4e650bfb9a4259b427069caa11b5a4df3d4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:35 +0900 Subject: [PATCH 074/474] powerpc: remove unused zInstall target from arch/powerpc/boot/Makefile Commit c913e5f95e54 ("powerpc/boot: Don't install zImage.* from make install") added the zInstall target to arch/powerpc/boot/Makefile, but you cannot use it since the corresponding hook is missing in arch/powerpc/Makefile. It has never worked since its addition. Nobody has complained about it for 7 years, which means this code was unneeded. With this removal, the install.sh will be passed in with 4 parameters. Simplify the shell script. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-1-masahiroy@kernel.org --- arch/powerpc/boot/Makefile | 6 +----- arch/powerpc/boot/install.sh | 13 ------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e312ea802aa6..a702f9d1ec0d 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -448,11 +448,7 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" -# Install the vmlinux and other built boot targets. -zInstall: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^ - -PHONY += install zInstall +PHONY += install # anything not in $(targets) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh index b6a256bc96ee..658c93ca7437 100644 --- a/arch/powerpc/boot/install.sh +++ b/arch/powerpc/boot/install.sh @@ -15,7 +15,6 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# $5 and more - kernel boot files; zImage*, uImage, cuImage.*, etc. # # Bail with error code if anything goes wrong @@ -41,15 +40,3 @@ fi cat $2 > $4/$image_name cp $3 $4/System.map - -# Copy all the bootable image files -path=$4 -shift 4 -while [ $# -ne 0 ]; do - image_name=`basename $1` - if [ -f $path/$image_name ]; then - mv $path/$image_name $path/$image_name.old - fi - cat $1 > $path/$image_name - shift -done; From 9bef456b20581e630ef9a13555ca04fed65a859d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:36 +0900 Subject: [PATCH 075/474] powerpc: make the install target not depend on any build artifact The install target should not depend on any build artifact. The reason is explained in commit 19514fc665ff ("arm, kbuild: make "make install" not depend on vmlinux"). Change the PowerPC installation code in a similar way. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-2-masahiroy@kernel.org --- arch/powerpc/boot/Makefile | 2 +- arch/powerpc/boot/install.sh | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index a702f9d1ec0d..0d165bd98b61 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -445,7 +445,7 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) $(Q)rm -f $@; ln $< $@ # Only install the vmlinux -install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) +install: sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" PHONY += install diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh index 658c93ca7437..14473150ddb4 100644 --- a/arch/powerpc/boot/install.sh +++ b/arch/powerpc/boot/install.sh @@ -20,6 +20,20 @@ # Bail with error code if anything goes wrong set -e +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist +verify "$2" +verify "$3" + # User may have a custom install script if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi From 86ff0bce2e9665c8b074930fe6caed615da070c1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:37 +0900 Subject: [PATCH 076/474] powerpc: move the install rule to arch/powerpc/Makefile Currently, the install target in arch/powerpc/Makefile descends into arch/powerpc/boot/Makefile to invoke the shell script, but there is no good reason to do so. arch/powerpc/Makefile can run the shell script directly. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-3-masahiroy@kernel.org --- arch/powerpc/Makefile | 3 ++- arch/powerpc/boot/Makefile | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 6505d66f1193..9aaf1abbc641 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -407,7 +407,8 @@ endef PHONY += install install: - $(Q)$(MAKE) $(build)=$(boot) install + sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \ + System.map "$(INSTALL_PATH)" archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 0d165bd98b61..10c0fb306f15 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -444,12 +444,6 @@ $(obj)/zImage: $(addprefix $(obj)/, $(image-y)) $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) $(Q)rm -f $@; ln $< $@ -# Only install the vmlinux -install: - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" - -PHONY += install - # anything not in $(targets) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage zImage.initrd zImage.chrp zImage.coff zImage.holly \ From a4bec516b9c0823d7e2bb8c8928c98b535cf9adf Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 28 Jul 2021 23:26:05 +0530 Subject: [PATCH 077/474] powerpc/cacheinfo: Lookup cache by dt node and thread-group id Currently the cacheinfo code on powerpc indexes the "cache" objects (modelling the L1/L2/L3 caches) where the key is device-tree node corresponding to that cache. On some of the POWER server platforms thread-groups within the core share different sets of caches (Eg: On SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and threads 1,3,5,7 of the same core share another L1 cache). On such platforms, there is a single device-tree node corresponding to that cache and the cache-configuration within the threads of the core is indicated via "ibm,thread-groups" device-tree property. Since the current code is not aware of the "ibm,thread-groups" property, on the aforementoined systems, cacheinfo code still treats all the threads in the core to be sharing the cache because of the single device-tree node (In the earlier example, the cacheinfo code would says CPUs 0-7 share L1 cache). In this patch, we make the powerpc cacheinfo code aware of the "ibm,thread-groups" property. We indexe the "cache" objects by the key-pair (device-tree node, thread-group id). For any CPUX, for a given level of cache, the thread-group id is defined to be the first CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels of cache which are not represented in "ibm,thread-groups" property, the thread-group id is -1. [parth: Remove "static" keyword for the definition of "thread_group_l1_cache_map" and "thread_group_l2_cache_map" to get rid of the compile error.] Signed-off-by: Gautham R. Shenoy Signed-off-by: Parth Shah Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-2-parth@linux.ibm.com --- arch/powerpc/include/asm/smp.h | 3 ++ arch/powerpc/kernel/cacheinfo.c | 80 ++++++++++++++++++++++++--------- arch/powerpc/kernel/smp.c | 4 +- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 03b3d010cbab..1259040cc3a4 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -33,6 +33,9 @@ extern bool coregroup_enabled; extern int cpu_to_chip_id(int cpu); extern int *chip_id_lookup_table; +DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); + #ifdef CONFIG_SMP struct smp_ops_t { diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 6f903e9aa20b..5a6925d87424 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -120,6 +120,7 @@ struct cache { struct cpumask shared_cpu_map; /* online CPUs using this cache */ int type; /* split cache disambiguation */ int level; /* level not explicit in device tree */ + int group_id; /* id of the group of threads that share this cache */ struct list_head list; /* global list of cache objects */ struct cache *next_local; /* next cache of >= level */ }; @@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache *cache) } static void cache_init(struct cache *cache, int type, int level, - struct device_node *ofnode) + struct device_node *ofnode, int group_id) { cache->type = type; cache->level = level; cache->ofnode = of_node_get(ofnode); + cache->group_id = group_id; INIT_LIST_HEAD(&cache->list); list_add(&cache->list, &cache_list); } -static struct cache *new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, + struct device_node *ofnode, int group_id) { struct cache *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache) - cache_init(cache, type, level, ofnode); + cache_init(cache, type, level, ofnode, group_id); return cache; } @@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache) return cache; list_for_each_entry(iter, &cache_list, list) - if (iter->ofnode == cache->ofnode && iter->next_local == cache) + if (iter->ofnode == cache->ofnode && + iter->group_id == cache->group_id && + iter->next_local == cache) return iter; return cache; } -/* return the first cache on a local list matching node */ -static struct cache *cache_lookup_by_node(const struct device_node *node) +/* return the first cache on a local list matching node and thread-group id */ +static struct cache *cache_lookup_by_node_group(const struct device_node *node, + int group_id) { struct cache *cache = NULL; struct cache *iter; list_for_each_entry(iter, &cache_list, list) { - if (iter->ofnode != node) + if (iter->ofnode != node || + iter->group_id != group_id) continue; cache = cache_find_first_sibling(iter); break; @@ -352,14 +359,15 @@ static int cache_is_unified_d(const struct device_node *np) CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED; } -static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, int group_id, + int level) { pr_debug("creating L%d ucache for %pOFP\n", level, node); - return new_cache(cache_is_unified_d(node), level, node); + return new_cache(cache_is_unified_d(node), level, node, group_id); } -static struct cache *cache_do_one_devnode_split(struct device_node *node, +static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id, int level) { struct cache *dcache, *icache; @@ -367,8 +375,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, pr_debug("creating L%d dcache and icache for %pOFP\n", level, node); - dcache = new_cache(CACHE_TYPE_DATA, level, node); - icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); + dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id); + icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id); if (!dcache || !icache) goto err; @@ -382,31 +390,32 @@ err: return NULL; } -static struct cache *cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level) { struct cache *cache; if (cache_node_is_unified(node)) - cache = cache_do_one_devnode_unified(node, level); + cache = cache_do_one_devnode_unified(node, group_id, level); else - cache = cache_do_one_devnode_split(node, level); + cache = cache_do_one_devnode_split(node, group_id, level); return cache; } static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int group_id, int level) { struct cache *cache; - cache = cache_lookup_by_node(node); + cache = cache_lookup_by_node_group(node, group_id); WARN_ONCE(cache && cache->level != level, "cache level mismatch on lookup (got %d, expected %d)\n", cache->level, level); if (!cache) - cache = cache_do_one_devnode(node, level); + cache = cache_do_one_devnode(node, group_id, level); return cache; } @@ -443,7 +452,27 @@ static void do_subsidiary_caches_debugcheck(struct cache *cache) of_node_get_device_type(cache->ofnode)); } -static void do_subsidiary_caches(struct cache *cache) +/* + * If sub-groups of threads in a core containing @cpu_id share the + * L@level-cache (information obtained via "ibm,thread-groups" + * device-tree property), then we identify the group by the first + * thread-sibling in the group. We define this to be the group-id. + * + * In the absence of any thread-group information for L@level-cache, + * this function returns -1. + */ +static int get_group_id(unsigned int cpu_id, int level) +{ + if (has_big_cores && level == 1) + return cpumask_first(per_cpu(thread_group_l1_cache_map, + cpu_id)); + else if (thread_group_shares_l2 && level == 2) + return cpumask_first(per_cpu(thread_group_l2_cache_map, + cpu_id)); + return -1; +} + +static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id) { struct device_node *subcache_node; int level = cache->level; @@ -452,9 +481,11 @@ static void do_subsidiary_caches(struct cache *cache) while ((subcache_node = of_find_next_cache_node(cache->ofnode))) { struct cache *subcache; + int group_id; level++; - subcache = cache_lookup_or_instantiate(subcache_node, level); + group_id = get_group_id(cpu_id, level); + subcache = cache_lookup_or_instantiate(subcache_node, group_id, level); of_node_put(subcache_node); if (!subcache) break; @@ -468,6 +499,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; + int group_id; pr_debug("creating cache object(s) for CPU %i\n", cpu_id); @@ -476,11 +508,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) if (!cpu_node) goto out; - cpu_cache = cache_lookup_or_instantiate(cpu_node, 1); + group_id = get_group_id(cpu_id, 1); + + cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1); if (!cpu_cache) goto out; - do_subsidiary_caches(cpu_cache); + do_subsidiary_caches(cpu_cache, cpu_id); cache_cpu_set(cpu_cache, cpu_id); out: @@ -848,13 +882,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cache; + int group_id; cpu_node = of_get_cpu_node(cpu_id, NULL); WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); if (!cpu_node) return NULL; - cache = cache_lookup_by_node(cpu_node); + group_id = get_group_id(cpu_id, 1); + cache = cache_lookup_by_node_group(cpu_node, group_id); of_node_put(cpu_node); return cache; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 447b78a87c8f..a7fcac44a8e2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -122,14 +122,14 @@ static struct thread_groups_list tgl[NR_CPUS] __initdata; * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to * the set its siblings that share the L1-cache. */ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); /* * On some big-cores system, thread_group_l2_cache_map for each CPU * corresponds to the set its siblings within the core that share the * L2-cache. */ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; From 69aa8e078545bc14d84a8b4b3cb914ac8f9f280e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 28 Jul 2021 23:26:06 +0530 Subject: [PATCH 078/474] powerpc/cacheinfo: Remove the redundant get_shared_cpu_map() The helper function get_shared_cpu_map() was added in 'commit 500fe5f550ec ("powerpc/cacheinfo: Report the correct shared_cpu_map on big-cores")' and subsequently expanded upon in 'commit 0be47634db0b ("powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache")' in order to help report the correct groups of threads sharing these caches on big-core systems where groups of threads within a core can share different sets of caches. Now that powerpc/cacheinfo is aware of "ibm,thread-groups" property, cache->shared_cpu_map contains the correct set of thread-siblings sharing the cache. Hence we no longer need the functions get_shared_cpu_map(). This patch removes this function. We also remove the helper function index_dir_to_cpu() which was only called by get_shared_cpu_map(). With these functions removed, we can still see the correct cache-sibling map/list for L1 and L2 caches on systems with L1 and L2 caches distributed among groups of threads in a core. With this patch, on a SMT8 POWER10 system where the L1 and L2 caches are split between the two groups of threads in a core, for CPUs 8,9, the L1-Data, L1-Instruction, L2, L3 cache CPU sibling list is as follows: $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15 $ ppc64_cpu --smt=4 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11 $ ppc64_cpu --smt=2 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9 $ ppc64_cpu --smt=1 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8 Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-3-parth@linux.ibm.com --- arch/powerpc/kernel/cacheinfo.c | 41 +-------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 5a6925d87424..20d91693eac1 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -675,45 +675,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); -static unsigned int index_dir_to_cpu(struct cache_index_dir *index) -{ - struct kobject *index_dir_kobj = &index->kobj; - struct kobject *cache_dir_kobj = index_dir_kobj->parent; - struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; - struct device *dev = kobj_to_dev(cpu_dev_kobj); - - return dev->id; -} - -/* - * On big-core systems, each core has two groups of CPUs each of which - * has its own L1-cache. The thread-siblings which share l1-cache with - * @cpu can be obtained via cpu_smallcore_mask(). - * - * On some big-core systems, the L2 cache is shared only between some - * groups of siblings. This is already parsed and encoded in - * cpu_l2_cache_mask(). - * - * TODO: cache_lookup_or_instantiate() needs to be made aware of the - * "ibm,thread-groups" property so that cache->shared_cpu_map - * reflects the correct siblings on platforms that have this - * device-tree property. This helper function is only a stop-gap - * solution so that we report the correct siblings to the - * userspace via sysfs. - */ -static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache) -{ - if (has_big_cores) { - int cpu = index_dir_to_cpu(index); - if (cache->level == 1) - return cpu_smallcore_mask(cpu); - if (cache->level == 2 && thread_group_shares_l2) - return cpu_l2_cache_mask(cpu); - } - - return &cache->shared_cpu_map; -} - static ssize_t show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list) { @@ -724,7 +685,7 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo index = kobj_to_cache_index_dir(k); cache = index->cache; - mask = get_shared_cpu_map(index, cache); + mask = &cache->shared_cpu_map; return cpumap_print_to_pagebuf(list, buf, mask); } From e9ef81e1079b0c4c374fba0f9affa7129c7c913b Mon Sep 17 00:00:00 2001 From: Parth Shah Date: Wed, 28 Jul 2021 23:26:07 +0530 Subject: [PATCH 079/474] powerpc/smp: Use existing L2 cache_map cpumask to find L3 cache siblings On POWER10 systems, the "ibm,thread-groups" property "2" indicates the cpus in thread-group share both L2 and L3 caches. Hence, use cache_property = 2 itself to find both the L2 and L3 cache siblings. Hence, create a new thread_group_l3_cache_map to keep list of L3 siblings, but fill the mask using same property "2" array. Signed-off-by: Parth Shah Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-4-parth@linux.ibm.com --- arch/powerpc/include/asm/smp.h | 3 ++ arch/powerpc/kernel/cacheinfo.c | 3 ++ arch/powerpc/kernel/smp.c | 80 +++++++++++++++++++++------------ 3 files changed, 58 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 1259040cc3a4..7ef1cd8168a0 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -35,6 +35,7 @@ extern int *chip_id_lookup_table; DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); #ifdef CONFIG_SMP @@ -144,6 +145,7 @@ extern int cpu_to_core_id(int cpu); extern bool has_big_cores; extern bool thread_group_shares_l2; +extern bool thread_group_shares_l3; #define cpu_smt_mask cpu_smt_mask #ifdef CONFIG_SCHED_SMT @@ -198,6 +200,7 @@ extern void __cpu_die(unsigned int cpu); #define hard_smp_processor_id() get_hard_smp_processor_id(0) #define smp_setup_cpu_maps() #define thread_group_shares_l2 0 +#define thread_group_shares_l3 0 static inline void inhibit_secondary_onlining(void) {} static inline void uninhibit_secondary_onlining(void) {} static inline const struct cpumask *cpu_sibling_mask(int cpu) diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 20d91693eac1..cf1be75b7833 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -469,6 +469,9 @@ static int get_group_id(unsigned int cpu_id, int level) else if (thread_group_shares_l2 && level == 2) return cpumask_first(per_cpu(thread_group_l2_cache_map, cpu_id)); + else if (thread_group_shares_l3 && level == 3) + return cpumask_first(per_cpu(thread_group_l3_cache_map, + cpu_id)); return -1; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a7fcac44a8e2..f2abd88e0c25 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -78,6 +78,7 @@ struct task_struct *secondary_current; bool has_big_cores; bool coregroup_enabled; bool thread_group_shares_l2; +bool thread_group_shares_l3; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -101,7 +102,7 @@ enum { #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 -#define THREAD_GROUP_SHARE_L2 2 +#define THREAD_GROUP_SHARE_L2_L3 2 struct thread_groups { unsigned int property; unsigned int nr_groups; @@ -131,6 +132,12 @@ DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); */ DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +/* + * On P10, thread_group_l3_cache_map for each CPU is equal to the + * thread_group_l2_cache_map + */ +DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); + /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -889,33 +896,10 @@ out: return tg; } -static int __init init_thread_group_cache_map(int cpu, int cache_property) - +static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start) { int first_thread = cpu_first_thread_sibling(cpu); - int i, cpu_group_start = -1, err = 0; - struct thread_groups *tg = NULL; - cpumask_var_t *mask = NULL; - - if (cache_property != THREAD_GROUP_SHARE_L1 && - cache_property != THREAD_GROUP_SHARE_L2) - return -EINVAL; - - tg = get_thread_groups(cpu, cache_property, &err); - if (!tg) - return err; - - cpu_group_start = get_cpu_thread_group_start(cpu, tg); - - if (unlikely(cpu_group_start == -1)) { - WARN_ON_ONCE(1); - return -ENODATA; - } - - if (cache_property == THREAD_GROUP_SHARE_L1) - mask = &per_cpu(thread_group_l1_cache_map, cpu); - else if (cache_property == THREAD_GROUP_SHARE_L2) - mask = &per_cpu(thread_group_l2_cache_map, cpu); + int i; zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); @@ -934,6 +918,44 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return 0; } +static int __init init_thread_group_cache_map(int cpu, int cache_property) + +{ + int cpu_group_start = -1, err = 0; + struct thread_groups *tg = NULL; + cpumask_var_t *mask = NULL; + + if (cache_property != THREAD_GROUP_SHARE_L1 && + cache_property != THREAD_GROUP_SHARE_L2_L3) + return -EINVAL; + + tg = get_thread_groups(cpu, cache_property, &err); + + if (!tg) + return err; + + cpu_group_start = get_cpu_thread_group_start(cpu, tg); + + if (unlikely(cpu_group_start == -1)) { + WARN_ON_ONCE(1); + return -ENODATA; + } + + if (cache_property == THREAD_GROUP_SHARE_L1) { + mask = &per_cpu(thread_group_l1_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + else if (cache_property == THREAD_GROUP_SHARE_L2_L3) { + mask = &per_cpu(thread_group_l2_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + mask = &per_cpu(thread_group_l3_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + + + return 0; +} + static bool shared_caches; #ifdef CONFIG_SCHED_SMT @@ -1020,14 +1042,16 @@ static int __init init_big_cores(void) has_big_cores = true; for_each_possible_cpu(cpu) { - int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2); + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3); if (err) return err; } thread_group_shares_l2 = true; - pr_debug("L2 cache only shared by the threads in the small core\n"); + thread_group_shares_l3 = true; + pr_debug("L2/L3 cache only shared by the threads in the small core\n"); + return 0; } From cf9c615cde49fb5d2480549c8c955a0a387798d3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Jul 2021 00:15:04 +1000 Subject: [PATCH 080/474] powerpc/64s/perf: Always use SIAR for kernel interrupts If an interrupt is taken in kernel mode, always use SIAR for it rather than looking at regs_sipr. This prevents samples piling up around interrupt enable (hard enable or interrupt replay via soft enable) in PMUs / modes where the PR sample indication is not in synch with SIAR. This results in better sampling of interrupt entry and exit in particular. Signed-off-by: Nicholas Piggin Tested-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720141504.420110-1-npiggin@gmail.com --- arch/powerpc/perf/core-book3s.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index bb0ee716de91..91203ed9d0ff 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -340,6 +340,13 @@ static inline void perf_read_regs(struct pt_regs *regs) * If the PMU doesn't update the SIAR for non marked events use * pt_regs. * + * If regs is a kernel interrupt, always use SIAR. Some PMUs have an + * issue with regs_sipr not being in synch with SIAR in interrupt entry + * and return sequences, which can result in regs_sipr being true for + * kernel interrupts and SIAR, which has the effect of causing samples + * to pile up at mtmsrd MSR[EE] 0->1 or pending irq replay around + * interrupt entry/exit. + * * If the PMU has HV/PR flags then check to see if they * place the exception in userspace. If so, use pt_regs. In * continuous sampling mode the SIAR and the PMU exception are @@ -356,6 +363,8 @@ static inline void perf_read_regs(struct pt_regs *regs) use_siar = 1; else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING)) use_siar = 0; + else if (!user_mode(regs)) + use_siar = 1; else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs)) use_siar = 0; else From 946e1052cdcc7e585ee5d1e72528ca49fb295243 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 18 Jul 2021 19:33:09 -0700 Subject: [PATCH 081/474] openrisc: don't printk() unconditionally Don't call printk() when CONFIG_PRINTK is not set. Fixes the following build errors: or1k-linux-ld: arch/openrisc/kernel/entry.o: in function `_external_irq_handler': (.text+0x804): undefined reference to `printk' (.text+0x804): relocation truncated to fit: R_OR1K_INSN_REL_26 against undefined symbol `printk' Fixes: 9d02a4283e9c ("OpenRISC: Boot code") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: openrisc@lists.librecores.org Signed-off-by: Stafford Horne --- arch/openrisc/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index bc657e55c15f..98e4f97db515 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -547,6 +547,7 @@ EXCEPTION_ENTRY(_external_irq_handler) l.bnf 1f // ext irq enabled, all ok. l.nop +#ifdef CONFIG_PRINTK l.addi r1,r1,-0x8 l.movhi r3,hi(42f) l.ori r3,r3,lo(42f) @@ -560,6 +561,7 @@ EXCEPTION_ENTRY(_external_irq_handler) .string "\n\rESR interrupt bug: in _external_irq_handler (ESR %x)\n\r" .align 4 .previous +#endif l.ori r4,r4,SPR_SR_IEE // fix the bug // l.sw PT_SR(r1),r4 From 11648cbb7b335b7eb54e1ff973fb938939616f46 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Jul 2021 19:23:38 -0700 Subject: [PATCH 082/474] openrisc: rename or32 code & comments to or1k From Documentation/openrisc/todo.rst, rename "or32" in the source code to "or1k" since this is the name that has been settled on. Signed-off-by: Randy Dunlap Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: openrisc@lists.librecores.org Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/pgtable.h | 6 +++--- arch/openrisc/include/asm/thread_info.h | 2 +- arch/openrisc/kernel/entry.S | 4 ++-- arch/openrisc/kernel/head.S | 6 +++--- arch/openrisc/kernel/setup.c | 4 ++-- arch/openrisc/lib/Makefile | 2 +- arch/openrisc/mm/fault.c | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 4ac591c9ca33..cdd657f80bfa 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -12,7 +12,7 @@ * et al. */ -/* or32 pgtable.h - macros and functions to manipulate page tables +/* or1k pgtable.h - macros and functions to manipulate page tables * * Based on: * include/asm-cris/pgtable.h @@ -29,14 +29,14 @@ /* * The Linux memory management assumes a three-level page table setup. On - * or32, we use that, but "fold" the mid level into the top-level page + * or1k, we use that, but "fold" the mid level into the top-level page * table. Since the MMU TLB is software loaded through an interrupt, it * supports any page table structure, so we could have used a three-level * setup, but for the amounts of memory we normally use, a two-level is * probably more efficient. * * This file contains the functions and defines necessary to modify and use - * the or32 page table tree. + * the or1k page table tree. */ extern void paging_init(void); diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 4f9d2a261455..659834ab87fa 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -25,7 +25,7 @@ /* THREAD_SIZE is the size of the task_struct/kernel_stack combo. * normally, the stack is found by doing something like p + THREAD_SIZE - * in or32, a page is 8192 bytes, which seems like a sane size + * in or1k, a page is 8192 bytes, which seems like a sane size */ #define THREAD_SIZE_ORDER 0 diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index 98e4f97db515..8c8ac3451425 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -326,7 +326,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler) 1: l.ori r6,r0,0x0 // !write access 2: - /* call fault.c handler in or32/mm/fault.c */ + /* call fault.c handler in openrisc/mm/fault.c */ l.jal do_page_fault l.nop l.j _ret_from_exception @@ -348,7 +348,7 @@ EXCEPTION_ENTRY(_insn_page_fault_handler) /* r4 set be EXCEPTION_HANDLE */ // effective address of fault l.ori r6,r0,0x0 // !write access - /* call fault.c handler in or32/mm/fault.c */ + /* call fault.c handler in openrisc/mm/fault.c */ l.jal do_page_fault l.nop l.j _ret_from_exception diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index af355e3f4619..15f1b38dfe03 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -599,7 +599,7 @@ flush_tlb: l.jal _flush_tlb l.nop -/* The MMU needs to be enabled before or32_early_setup is called */ +/* The MMU needs to be enabled before or1k_early_setup is called */ enable_mmu: /* @@ -641,9 +641,9 @@ enable_mmu: /* magic number mismatch, set fdt pointer to null */ l.or r25,r0,r0 _fdt_found: - /* pass fdt pointer to or32_early_setup in r3 */ + /* pass fdt pointer to or1k_early_setup in r3 */ l.or r3,r0,r25 - LOAD_SYMBOL_2_GPR(r24, or32_early_setup) + LOAD_SYMBOL_2_GPR(r24, or1k_early_setup) l.jalr r24 l.nop diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 8ae2da6ac097..7eddcac0ef2f 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -209,7 +209,7 @@ void __init setup_cpuinfo(void) } /** - * or32_early_setup + * or1k_early_setup * * Handles the pointer to the device tree that this kernel is to use * for establishing the available platform devices. @@ -217,7 +217,7 @@ void __init setup_cpuinfo(void) * Falls back on built-in device tree in case null pointer is passed. */ -void __init or32_early_setup(void *fdt) +void __init or1k_early_setup(void *fdt) { if (fdt) pr_info("FDT at %p\n", fdt); diff --git a/arch/openrisc/lib/Makefile b/arch/openrisc/lib/Makefile index 79775aaa6baa..53327406b483 100644 --- a/arch/openrisc/lib/Makefile +++ b/arch/openrisc/lib/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only # -# Makefile for or32 specific library files.. +# Makefile for or1k specific library files.. # obj-y := delay.o string.o memset.o memcpy.o diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index ca97d9baab51..c730d1a51686 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -28,7 +28,7 @@ unsigned long pte_misses; /* updated by do_page_fault() */ unsigned long pte_errors; /* updated by do_page_fault() */ /* __PHX__ :: - check the vmalloc_fault in do_page_fault() - * - also look into include/asm-or32/mmu_context.h + * - also look into include/asm/mmu_context.h */ volatile pgd_t *current_pgd[NR_CPUS]; From 730d070ae9f12fff5d44fe8fb0547ae37d100da8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:15:45 +0200 Subject: [PATCH 083/474] MIPS: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/mips-mt-fpaff.c | 10 +++++----- arch/mips/kernel/process.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 6c590ef27648..67e130d3f038 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -76,13 +76,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len, if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); return -ESRCH; } @@ -147,7 +147,7 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); + cpus_read_unlock(); return retval; } @@ -166,7 +166,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, if (len < real_len) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); rcu_read_lock(); retval = -ESRCH; @@ -182,7 +182,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, out_unlock: rcu_read_unlock(); - put_online_cpus(); + cpus_read_unlock(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 73c8e7990a97..95aa86fa6077 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -859,10 +859,10 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) * scheduled in then it will already have picked up the new FP mode * whilst doing so. */ - get_online_cpus(); + cpus_read_lock(); for_each_cpu_and(cpu, &process_cpus, cpu_online_mask) work_on_cpu(cpu, prepare_for_fp_mode_switch, NULL); - put_online_cpus(); + cpus_read_unlock(); return 0; } From ad548993a66c498267695edd8b19a682be0e3a8b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 4 Aug 2021 17:02:16 -0700 Subject: [PATCH 084/474] MIPS: loongson2ef: don't build serial.o unconditionally LOONGSON_UART_BASE depends on EARLY_PRINTK || SERIAL_8250, but when neither of these Kconfig symbols is set, the kernel build has errors: ../arch/mips/loongson2ef/common/serial.c: In function 'serial_init': ../arch/mips/loongson2ef/common/serial.c:66:25: error: 'loongson_uart_base' undeclared (first use in this function) 66 | loongson_uart_base; ../arch/mips/loongson2ef/common/serial.c:66:25: note: each undeclared identifier is reported only once for each function it appears in ../arch/mips/loongson2ef/common/serial.c:68:41: error: '_loongson_uart_base' undeclared (first use in this function) 68 | (void __iomem *)_loongson_uart_base; Fix this by building serial.o only when one (or both) of these Kconfig symbols is enabled. Tested with: (a) EARLY_PRINTK=y, SERIAL_8250 not set; (b) EARLY_PRINTK=y, SERIAL_8250=y; (c) EARLY_PRINTK=y, SERIAL_8250=m. (d) EARLY_PRINTK not set, SERIAL_8250=y; (e) EARLY_PRINTK not set, SERIAL_8250=m; (f) EARLY_PRINTK not set, SERIAL_8250 not set. Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Jiaxun Yang Cc: Thomas Bogendoerfer Cc: linux-mips@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/loongson2ef/common/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/loongson2ef/common/Makefile b/arch/mips/loongson2ef/common/Makefile index d5ab3e543ea3..30ea8b5ca685 100644 --- a/arch/mips/loongson2ef/common/Makefile +++ b/arch/mips/loongson2ef/common/Makefile @@ -4,12 +4,14 @@ # obj-y += setup.o init.o env.o time.o reset.o irq.o \ - bonito-irq.o mem.o machtype.o platform.o serial.o + bonito-irq.o mem.o machtype.o platform.o obj-$(CONFIG_PCI) += pci.o # # Serial port support # +obj-$(CONFIG_LOONGSON_UART_BASE) += serial.o +obj-$(CONFIG_EARLY_PRINTK) += serial.o obj-$(CONFIG_LOONGSON_UART_BASE) += uart_base.o obj-$(CONFIG_LOONGSON_MC146818) += rtc.o From cb95ea79b3fc772c5873a7a4532ab4c14a455da2 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Thu, 29 Jul 2021 17:31:52 +0800 Subject: [PATCH 085/474] MIPS: locking/atomic: Fix atomic{_64,}_sub_if_positive This looks like a typo and that caused atomic64 test failed. Signed-off-by: Rui Wang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h index 95e1f7f3597f..a0b9e7c1e4fc 100644 --- a/arch/mips/include/asm/atomic.h +++ b/arch/mips/include/asm/atomic.h @@ -206,7 +206,7 @@ ATOMIC_OPS(atomic64, xor, s64, ^=, xor, lld, scd) * The function returns the old value of @v minus @i. */ #define ATOMIC_SIP_OP(pfx, type, op, ll, sc) \ -static __inline__ int arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \ +static __inline__ type arch_##pfx##_sub_if_positive(type i, pfx##_t * v) \ { \ type temp, result; \ \ From 09ca497528dac12cbbceab8197011c875a96d053 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 27 Jun 2021 17:09:18 +0000 Subject: [PATCH 086/474] powerpc: Remove in_kernel_text() Last user of in_kernel_text() stopped using in with commit 549e8152de80 ("powerpc: Make the 64-bit kernel as a position-independent executable"). Generic function is_kernel_text() does the same. So remote it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2a3a5b6f8cc0ef4e854d7b764f66aa8d2ee270d2.1624813698.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/sections.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 324d7b298ec3..6e4af4492a14 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -38,14 +38,6 @@ extern char start_virt_trampolines[]; extern char end_virt_trampolines[]; #endif -static inline int in_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) - return 1; - - return 0; -} - static inline unsigned long kernel_toc_addr(void) { /* Defined by the linker, see vmlinux.lds.S */ From c8a6d91005343dea0d53be0ff0620c66934dcd44 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 5 Jul 2021 12:00:50 +0000 Subject: [PATCH 087/474] powerpc/non-smp: Unconditionaly call smp_mb() on switch_mm Commit 3ccfebedd8cf ("powerpc, membarrier: Skip memory barrier in switch_mm()") added some logic to skip the smp_mb() in switch_mm_irqs_off() before the call to switch_mmu_context(). However, on non SMP smp_mb() is just a compiler barrier and doing it unconditionaly is simpler than the logic used to check whether the barrier is needed or not. After the patch: 00000000 : ... c: 7c 04 18 40 cmplw r4,r3 10: 81 24 00 24 lwz r9,36(r4) 14: 91 25 04 c8 stw r9,1224(r5) 18: 4d 82 00 20 beqlr 1c: 48 00 00 00 b 1c 1c: R_PPC_REL24 switch_mmu_context Before the patch: 00000000 : ... c: 7c 04 18 40 cmplw r4,r3 10: 81 24 00 24 lwz r9,36(r4) 14: 91 25 04 c8 stw r9,1224(r5) 18: 4d 82 00 20 beqlr 1c: 81 24 00 28 lwz r9,40(r4) 20: 71 29 00 0a andi. r9,r9,10 24: 40 82 00 34 bne 58 28: 48 00 00 00 b 28 28: R_PPC_REL24 switch_mmu_context ... 58: 2c 03 00 00 cmpwi r3,0 5c: 41 82 ff cc beq 28 60: 48 00 00 00 b 60 60: R_PPC_REL24 switch_mmu_context Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e9d501da0c59f60ca767b1b3ea4603fce6d02b9e.1625486440.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/membarrier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h index 6e20bb5c74ea..de7f79157918 100644 --- a/arch/powerpc/include/asm/membarrier.h +++ b/arch/powerpc/include/asm/membarrier.h @@ -12,7 +12,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, * when switching from userspace to kernel is not needed after * store to rq->curr. */ - if (likely(!(atomic_read(&next->membarrier_state) & + if (IS_ENABLED(CONFIG_SMP) && + likely(!(atomic_read(&next->membarrier_state) & (MEMBARRIER_STATE_PRIVATE_EXPEDITED | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) return; From 9c7248bb8de31f51c693bfa6a6ea53b1c07e0fa8 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Tue, 11 May 2021 09:31:36 +0200 Subject: [PATCH 088/474] powerpc/numa: Consider the max NUMA node for migratable LPAR When a LPAR is migratable, we should consider the maximum possible NUMA node instead of the number of NUMA nodes from the actual system. The DT property 'ibm,current-associativity-domains' defines the maximum number of nodes the LPAR can see when running on that box. But if the LPAR is being migrated on another box, it may see up to the nodes defined by 'ibm,max-associativity-domains'. So if a LPAR is migratable, that value should be used. Unfortunately, there is no easy way to know if an LPAR is migratable or not. The hypervisor exports the property 'ibm,migratable-partition' in the case it set to migrate partition, but that would not mean that the current partition is migratable. Without this patch, when a LPAR is started on a 2 node box and then migrated to a 3 node box, the hypervisor may spread the LPAR's CPUs on the 3rd node. In that case if a CPU from that 3rd node is added to the LPAR, it will be wrongly assigned to the node because the kernel has been set to use up to 2 nodes (the configuration of the departure node). With this patch applies, the CPU is correctly added to the 3rd node. Fixes: f9f130ff2ec9 ("powerpc/numa: Detect support for coregroup") Signed-off-by: Laurent Dufour Reviewed-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210511073136.17795-1-ldufour@linux.ibm.com --- arch/powerpc/mm/numa.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..094a1076fd1f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -893,7 +893,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - const __be32 *domains; + const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -909,9 +909,14 @@ static void __init find_possible_nodes(void) * it doesn't exist, then fallback on ibm,max-associativity-domains. * Current denotes what the platform can support compared to max * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. */ - domains = of_get_property(rtas, "ibm,current-associativity-domains", - &prop_length); + if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -920,6 +925,8 @@ static void __init find_possible_nodes(void) } max_nodes = of_read_number(&domains[min_common_depth], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); From d144f4d5a8a804133d20ff311d7be70bcdbfaac2 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 17 May 2021 11:06:06 +0200 Subject: [PATCH 089/474] pseries/drmem: update LMBs after LPM After a LPM, the device tree node ibm,dynamic-reconfiguration-memory may be updated by the hypervisor in the case the NUMA topology of the LPAR's memory is updated. This is handled by the kernel, but the memory's node is not updated because there is no way to move a memory block between nodes from the Linux kernel point of view. If later a memory block is added or removed, drmem_update_dt() is called and it is overwriting the DT node ibm,dynamic-reconfiguration-memory to match the added or removed LMB. But the LMB's associativity node has not been updated after the DT node update and thus the node is overwritten by the Linux's topology instead of the hypervisor one. Introduce a hook called when the ibm,dynamic-reconfiguration-memory node is updated to force an update of the LMB's associativity. However, ignore the call to that hook when the update has been triggered by drmem_update_dt(). Because, in that case, the LMB tree has been used to set the DT property and thus it doesn't need to be updated back. Since drmem_update_dt() is called under the protection of the device_hotplug_lock and the hook is called in the same context, use a simple boolean variable to detect that call. Signed-off-by: Laurent Dufour Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210517090606.56930-1-ldufour@linux.ibm.com --- arch/powerpc/include/asm/drmem.h | 1 + arch/powerpc/mm/drmem.c | 46 +++++++++++++++++++ .../platforms/pseries/hotplug-memory.c | 4 ++ 3 files changed, 51 insertions(+) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index bf2402fed3e0..4265d5e95c2c 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -111,6 +111,7 @@ int drmem_update_dt(void); int __init walk_drmem_lmbs_early(unsigned long node, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)); +void drmem_update_lmbs(struct property *prop); #endif static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 9af3832c9d8d..22197b18d85e 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -18,6 +18,7 @@ static int n_root_addr_cells, n_root_size_cells; static struct drmem_lmb_info __drmem_info; struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; u64 drmem_lmb_memory_max(void) { @@ -178,6 +179,11 @@ int drmem_update_dt(void) if (!memory) return -1; + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. + */ + in_drmem_update = true; prop = of_find_property(memory, "ibm,dynamic-memory", NULL); if (prop) { rc = drmem_update_dt_v1(memory, prop); @@ -186,6 +192,7 @@ int drmem_update_dt(void) if (prop) rc = drmem_update_dt_v2(memory, prop); } + in_drmem_update = false; of_node_put(memory); return rc; @@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data, return ret; } +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. + */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} #endif static int init_drmem_lmb_size(struct device_node *dn) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..0beb3ca2b549 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -979,6 +979,10 @@ static int pseries_memory_notifier(struct notifier_block *nb, case OF_RECONFIG_DETACH_NODE: err = pseries_remove_mem_node(rd->dn); break; + case OF_RECONFIG_UPDATE_PROPERTY: + if (!strcmp(rd->dn->name, + "ibm,dynamic-reconfiguration-memory")) + drmem_update_lmbs(rd->prop); } return notifier_from_errno(err); } From bd1dd4c5f5286df0148b5b316f37c583b8f55fa1 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 29 Apr 2021 19:49:08 +0200 Subject: [PATCH 090/474] powerpc/pseries: Prevent free CPU ids being reused on another node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a CPU is hot added, the CPU ids are taken from the available mask from the lower possible set. If that set of values was previously used for a CPU attached to a different node, it appears to an application as if these CPUs have migrated from one node to another node which is not expected. To prevent this, it is needed to record the CPU ids used for each node and to not reuse them on another node. However, to prevent CPU hot plug to fail, in the case the CPU ids is starved on a node, the capability to reuse other nodes’ free CPU ids is kept. A warning is displayed in such a case to warn the user. A new CPU bit mask (node_recorded_ids_map) is introduced for each possible node. It is populated with the CPU onlined at boot time, and then when a CPU is hot plugged to a node. The bits in that mask remain when the CPU is hot unplugged, to remind this CPU ids have been used for this node. If no id set was found, a retry is made without removing the ids used on the other nodes to try reusing them. This is the way ids have been allocated prior to this patch. The effect of this patch can be seen by removing and adding CPUs using the Qemu monitor. In the following case, the first CPU from the node 2 is removed, then the first one from the node 1 is removed too. Later, the first CPU of the node 2 is added back. Without that patch, the kernel will number these CPUs using the first CPU ids available which are the ones freed when removing the second CPU of the node 0. This leads to the CPU ids 16-23 to move from the node 1 to the node 2. With the patch applied, the CPU ids 32-39 are used since they are the lowest free ones which have not been used on another node. At boot time: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 Vanilla kernel, after the CPU hot unplug/plug operations: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 24 25 26 27 28 29 30 31 node 2 cpus: 16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47 Patched kernel, after the CPU hot unplug/plug operations: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 24 25 26 27 28 29 30 31 node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 Signed-off-by: Laurent Dufour Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210429174908.16613-1-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 171 ++++++++++++++----- 1 file changed, 132 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7e970f81d8ff..e1f224320102 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -39,6 +39,12 @@ /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; +/* + * Record the CPU ids used on each nodes. + * Protected by cpu_add_remove_lock. + */ +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES]; + static void rtas_stop_self(void) { static struct rtas_args args; @@ -139,72 +145,148 @@ static void pseries_cpu_die(unsigned int cpu) paca_ptrs[cpu]->cpu_start = 0; } +/** + * find_cpu_id_range - found a linear ranger of @nthreads free CPU ids. + * @nthreads : the number of threads (cpu ids) + * @assigned_node : the node it belongs to or NUMA_NO_NODE if free ids from any + * node can be peek. + * @cpu_mask: the returned CPU mask. + * + * Returns 0 on success. + */ +static int find_cpu_id_range(unsigned int nthreads, int assigned_node, + cpumask_var_t *cpu_mask) +{ + cpumask_var_t candidate_mask; + unsigned int cpu, node; + int rc = -ENOSPC; + + if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(*cpu_mask); + for (cpu = 0; cpu < nthreads; cpu++) + cpumask_set_cpu(cpu, *cpu_mask); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + + if (assigned_node != NUMA_NO_NODE) { + /* + * Remove free ids previously assigned on the other nodes. We + * can walk only online nodes because once a node became online + * it is not turned offlined back. + */ + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(candidate_mask, candidate_mask, + node_recorded_ids_map[node]); + } + } + + if (cpumask_empty(candidate_mask)) + goto out; + + while (!cpumask_empty(*cpu_mask)) { + if (cpumask_subset(*cpu_mask, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads); + } + + if (!cpumask_empty(*cpu_mask)) + rc = 0; + +out: + free_cpumask_var(candidate_mask); + return rc; +} + /* * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle - * here is that a cpu device node may represent up to two logical cpus + * here is that a cpu device node may represent multiple logical cpus * in the SMT case. We must honor the assumption in other code that * the logical ids for sibling SMT threads x and y are adjacent, such * that x^1 == y and y^1 == x. */ static int pseries_add_processor(struct device_node *np) { - unsigned int cpu; - cpumask_var_t candidate_mask, tmp; - int err = -ENOSPC, len, nthreads, i; + int len, nthreads, node, cpu, assigned_node; + int rc = 0; + cpumask_var_t cpu_mask; const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) return 0; - zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); - zalloc_cpumask_var(&tmp, GFP_KERNEL); - nthreads = len / sizeof(u32); - for (i = 0; i < nthreads; i++) - cpumask_set_cpu(i, tmp); + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + /* + * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA + * node id the added CPU belongs to. + */ + node = of_node_to_nid(np); + if (node < 0 || !node_possible(node)) + node = first_online_node; + + BUG_ON(node == NUMA_NO_NODE); + assigned_node = node; cpu_maps_update_begin(); - BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); - - /* Get a bitmap of unoccupied slots. */ - cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); - if (cpumask_empty(candidate_mask)) { - /* If we get here, it most likely means that NR_CPUS is - * less than the partition's max processors setting. + rc = find_cpu_id_range(nthreads, node, &cpu_mask); + if (rc && nr_node_ids > 1) { + /* + * Try again, considering the free CPU ids from the other node. */ - printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" - " supports %d logical cpus.\n", np, - num_possible_cpus()); - goto out_unlock; + node = NUMA_NO_NODE; + rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask); } - while (!cpumask_empty(tmp)) - if (cpumask_subset(tmp, candidate_mask)) - /* Found a range where we can insert the new cpu(s) */ - break; - else - cpumask_shift_left(tmp, tmp, nthreads); - - if (cpumask_empty(tmp)) { - printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %pOFn with %d thread(s)\n", np, - nthreads); - goto out_unlock; + if (rc) { + pr_err("Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, num_possible_cpus()); + goto out; } - for_each_cpu(cpu, tmp) { + for_each_cpu(cpu, cpu_mask) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } - err = 0; -out_unlock: + + /* Record the newly used CPU ids for the associate node. */ + cpumask_or(node_recorded_ids_map[assigned_node], + node_recorded_ids_map[assigned_node], cpu_mask); + + /* + * If node is set to NUMA_NO_NODE, CPU ids have be reused from + * another node, remove them from its mask. + */ + if (node == NUMA_NO_NODE) { + cpu = cpumask_first(cpu_mask); + pr_warn("Reusing free CPU ids %d-%d from another node\n", + cpu, cpu + nthreads - 1); + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(node_recorded_ids_map[node], + node_recorded_ids_map[node], + cpu_mask); + } + } + +out: cpu_maps_update_done(); - free_cpumask_var(candidate_mask); - free_cpumask_var(tmp); - return err; + free_cpumask_var(cpu_mask); + return rc; } /* @@ -908,6 +990,7 @@ static struct notifier_block pseries_smp_nb = { static int __init pseries_cpu_hotplug_init(void) { int qcss_tok; + unsigned int node; #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE ppc_md.cpu_probe = dlpar_cpu_probe; @@ -929,8 +1012,18 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { + for_each_node(node) { + alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]); + + /* Record ids of CPU added at boot time */ + cpumask_or(node_recorded_ids_map[node], + node_recorded_ids_map[node], + node_to_cpumask_map[node]); + } + of_reconfig_notifier_register(&pseries_smp_nb); + } return 0; } From c00103abf76fd3916596afd07dd3fdeee0dca15d Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Tue, 3 Aug 2021 16:59:55 +0200 Subject: [PATCH 091/474] powerpc/kexec: fix for_each_child.cocci warning for_each_node_by_type should have of_node_put() before return. Generated by: scripts/coccinelle/iterators/for_each_child.cocci Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/alpine.DEB.2.22.394.2108031654080.17639@hadrien --- arch/powerpc/kexec/core_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 84618d3c8013..89c069d664a5 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -64,8 +64,10 @@ int default_machine_kexec_prepare(struct kimage *image) begin = image->segment[i].mem; end = begin + image->segment[i].memsz; - if ((begin < high) && (end > low)) + if ((begin < high) && (end > low)) { + of_node_put(node); return -ETXTBSY; + } } } From 5ae36401ca4ea2737d779ce7c267444b16530001 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:15:46 +0200 Subject: [PATCH 092/474] powerpc: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210803141621.780504-4-bigeasy@linutronix.de --- arch/powerpc/kernel/rtasd.c | 4 ++-- arch/powerpc/kvm/book3s_hv_builtin.c | 10 +++++----- arch/powerpc/platforms/powernv/idle.c | 4 ++-- arch/powerpc/platforms/powernv/opal-imc.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 8561dfb33f24..32ee17753eb4 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -429,7 +429,7 @@ static void rtas_event_scan(struct work_struct *w) do_event_scan(); - get_online_cpus(); + cpus_read_lock(); /* raw_ OK because just using CPU as starting point. */ cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); @@ -451,7 +451,7 @@ static void rtas_event_scan(struct work_struct *w) schedule_delayed_work_on(cpu, &event_scan_work, __round_jiffies_relative(event_scan_delay, cpu)); - put_online_cpus(); + cpus_read_unlock(); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index be8ef1c5b1bf..fcf4760a3a0e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -137,23 +137,23 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, * exist in the system. We use a counter of VMs to track this. * * One of the operations we need to block is onlining of secondaries, so we - * protect hv_vm_count with get/put_online_cpus(). + * protect hv_vm_count with cpus_read_lock/unlock(). */ static atomic_t hv_vm_count; void kvm_hv_vm_activated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_inc(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); void kvm_hv_vm_deactivated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_dec(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 528a7e0cf83a..aa27689b832d 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -199,12 +199,12 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, */ power7_fastsleep_workaround_exit = false; - get_online_cpus(); + cpus_read_lock(); primary_thread_mask = cpu_online_cores_map(); on_each_cpu_mask(&primary_thread_mask, pnv_fastsleep_workaround_apply, &err, 1); - put_online_cpus(); + cpus_read_unlock(); if (err) { pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); goto fail; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 7824cc364bc4..ba02a75c1410 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -186,7 +186,7 @@ static void disable_nest_pmu_counters(void) int nid, cpu; const struct cpumask *l_cpumask; - get_online_cpus(); + cpus_read_lock(); for_each_node_with_cpus(nid) { l_cpumask = cpumask_of_node(nid); cpu = cpumask_first_and(l_cpumask, cpu_online_mask); @@ -195,7 +195,7 @@ static void disable_nest_pmu_counters(void) opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(cpu)); } - put_online_cpus(); + cpus_read_unlock(); } static void disable_core_pmu_counters(void) @@ -203,7 +203,7 @@ static void disable_core_pmu_counters(void) cpumask_t cores_map; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); /* Disable the IMC Core functions */ cores_map = cpu_online_cores_map(); for_each_cpu(cpu, &cores_map) { @@ -213,7 +213,7 @@ static void disable_core_pmu_counters(void) pr_err("%s: Failed to stop Core (cpu = %d)\n", __FUNCTION__, cpu); } - put_online_cpus(); + cpus_read_unlock(); } int get_max_nest_dev(void) From 27fd1111051dc218e5b6cb2da5dbb3f342879ff1 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Wed, 4 Aug 2021 11:37:24 +1000 Subject: [PATCH 093/474] powerpc: Always inline radix_enabled() to fix build failure This is the same as commit acdad8fb4a15 ("powerpc: Force inlining of mmu_has_feature to fix build failure") but for radix_enabled(). The config in the linked bugzilla causes the following build failure: LD .tmp_vmlinux.kallsyms1 powerpc64-linux-ld: arch/powerpc/mm/pgtable.o: in function `.__ptep_set_access_flags': pgtable.c:(.text+0x17c): undefined reference to `.radix__ptep_set_access_flags' powerpc64-linux-ld: arch/powerpc/mm/pageattr.o: in function `.change_page_attr': pageattr.c:(.text+0xc0): undefined reference to `.radix__flush_tlb_kernel_range' etc. This is due to radix_enabled() not being inlined. See extract from building with -Winline: In file included from arch/powerpc/include/asm/lppaca.h:46, from arch/powerpc/include/asm/paca.h:17, from arch/powerpc/include/asm/current.h:13, from include/linux/thread_info.h:23, from include/asm-generic/preempt.h:5, from ./arch/powerpc/include/generated/asm/preempt.h:1, from include/linux/preempt.h:78, from include/linux/spinlock.h:51, from include/linux/mmzone.h:8, from include/linux/gfp.h:6, from arch/powerpc/mm/pgtable.c:21: arch/powerpc/include/asm/book3s/64/pgtable.h: In function '__ptep_set_access_flags': arch/powerpc/include/asm/mmu.h:327:20: error: inlining failed in call to 'radix_enabled': call is unlikely and code size would grow [-Werror=inline] The code relies on constant folding of MMU_FTRS_POSSIBLE at buildtime and elimination of non possible parts of code at compile time. For this to work radix_enabled() must be inlined so make it __always_inline. Reported-by: Erhard F. Suggested-by: Michael Ellerman Tested-by: Randy Dunlap Signed-off-by: Jordan Niethe [mpe: Trimmed error messages in change log] Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=213803 Link: https://lore.kernel.org/r/20210804013724.514468-1-jniethe5@gmail.com --- arch/powerpc/include/asm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 27016b98ecb2..8abe8e42e045 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -324,7 +324,7 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* !CONFIG_DEBUG_VM */ -static inline bool radix_enabled(void) +static __always_inline bool radix_enabled(void) { return mmu_has_feature(MMU_FTR_TYPE_RADIX); } From 9b49f979b3d560cb75ea9f1a596baf432d566798 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 5 Aug 2021 11:20:05 +1000 Subject: [PATCH 094/474] powerpc/configs: Disable legacy ptys on microwatt defconfig We shouldn't need legacy ptys, and disabling the option improves boot time by about 0.5 seconds. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805112005.3cb1f412@kryten.localdomain --- arch/powerpc/configs/microwatt_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index a08b739123da..ebc90aefbc0c 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -57,6 +57,7 @@ CONFIG_NETDEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_CONSOLE=y From 2ac78e0c00184a9ba53d507be7549c69a3f566b6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 5 Aug 2021 17:56:49 +1000 Subject: [PATCH 095/474] KVM: PPC: Use arch_get_random_seed_long instead of powernv variant The powernv_get_random_long() does not work in nested KVM (which is pseries) and produces a crash when accessing in_be64(rng->regs) in powernv_get_random_long(). This replaces powernv_get_random_long with the ppc_md machine hook wrapper. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805075649.2086567-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..9f957ceee58a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1165,7 +1165,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; #endif case H_RANDOM: - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: From 786e5b102a0007d81579822eac23cb5bfaa0b65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:19 +0200 Subject: [PATCH 096/474] powerpc/pseries/pci: Introduce __find_pe_total_msi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will help to size the PCI MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-2-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 637300330507..d2d090e04745 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -164,12 +164,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) /* Quota calculation */ -static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +static struct device_node *__find_pe_total_msi(struct device_node *node, int *total) { struct device_node *dn; const __be32 *p; - dn = of_node_get(pci_device_to_OF_node(dev)); + dn = of_node_get(node); while (dn) { p = of_get_property(dn, "ibm,pe-total-#msi", NULL); if (p) { @@ -185,6 +185,11 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) return NULL; } +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + return __find_pe_total_msi(pci_device_to_OF_node(dev), total); +} + static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; From e81202007363bd694b711f307f02320b5f98edaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:20 +0200 Subject: [PATCH 097/474] powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This splits the routine setting the MSIs in two parts: allocation of MSIs for the PCI device at the FW level (RTAS) and the actual mapping and activation of the IRQs. rtas_prepare_msi_irqs() will serve as a handler for the PCI MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-3-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index d2d090e04745..4bf14f27e1aa 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -373,12 +373,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, + msi_alloc_info_t *arg) { struct pci_dn *pdn; - int hwirq, virq, i, quota, rc; - struct msi_desc *entry; - struct msi_msg msg; + int quota, rc; int nvec = nvec_in; int use_32bit_msi_hack = 0; @@ -456,6 +455,22 @@ again: return rc; } + return 0; +} + +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +{ + struct pci_dn *pdn; + int hwirq, virq, i; + int rc; + struct msi_desc *entry; + struct msi_msg msg; + + rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL); + if (rc) + return rc; + + pdn = pci_get_pdn(pdev); i = 0; for_each_pci_msi_entry(entry, pdev) { hwirq = rtas_query_irq_number(pdn, i++); From 14be098c5387eb93b794f299f3c3e2ddf6038ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:21 +0200 Subject: [PATCH 098/474] powerpc/xive: Add support for IRQ domain hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds handlers to allocate/free IRQs in a domain hierarchy. We could try to use xive_irq_domain_map() in xive_irq_domain_alloc() but we rely on xive_irq_alloc_data() to set the IRQ handler data and duplicating the code is simpler. xive_irq_free_data() needs to be called when IRQ are freed to clear the MMIO mappings and free the XIVE handler data, xive_irq_data structure. This is going to be a problem with MSI domains which we will address later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-4-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 64 +++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index dbdbbc2f1dc5..420d96deb7b6 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1366,7 +1366,71 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xive_irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); +} + +static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + /* TODO: call xive_irq_domain_map() */ + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + + /* allocates and sets handler data */ + rc = xive_irq_alloc_data(virq + i, hwirq + i); + if (rc) + return rc; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &xive_irq_chip, domain->host_data); + irq_set_handler(virq + i, handle_fasteoi_irq); + } + + return 0; +} + +static void xive_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + xive_irq_free_data(virq + i); +} +#endif + static const struct irq_domain_ops xive_irq_domain_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xive_irq_domain_alloc, + .free = xive_irq_domain_free, + .translate = xive_irq_domain_translate, +#endif .match = xive_irq_domain_match, .map = xive_irq_domain_map, .unmap = xive_irq_domain_unmap, From 6c2ab2a5d634d4e30445ee5d52d5d1469bf74aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:22 +0200 Subject: [PATCH 099/474] powerpc/xive: Ease debugging of xive_irq_set_affinity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pr_debug() is easier to activate and it helps to know how the kernel configures the HW when tweaking the IRQ subsystem. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-5-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 420d96deb7b6..823f9fe3542a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -713,7 +713,7 @@ static int xive_irq_set_affinity(struct irq_data *d, u32 target, old_target; int rc = 0; - pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq); /* Is this valid ? */ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) @@ -758,7 +758,7 @@ static int xive_irq_set_affinity(struct irq_data *d, return rc; } - pr_devel(" target: 0x%x\n", target); + pr_debug(" target: 0x%x\n", target); xd->target = target; /* Give up previous target */ From a5f3d2c17b07e69166b93209f34a5fb8271a6810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:23 +0200 Subject: [PATCH 100/474] powerpc/pseries/pci: Add MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two IRQ domains are added on top of default machine IRQ domain. First, the top level "pSeries-PCI-MSI" domain deals with the MSI specificities. In this domain, the HW IRQ numbers are generated by the PCI MSI layer, they compose a unique ID for an MSI source with the PCI device identifier and the MSI vector number. These numbers can be quite large on a pSeries machine running under the IBM Hypervisor and /sys/kernel/irq/ and /proc/interrupts will require small fixes to show them correctly. Second domain is the in-the-middle "pSeries-MSI" domain which acts as a proxy between the PCI MSI subsystem and the machine IRQ subsystem. It usually allocate the MSI vector numbers but, on pSeries machines, this is done by the RTAS FW and RTAS returns IRQ numbers in the IRQ number space of the machine. This is why the in-the-middle "pSeries-MSI" domain has the same HW IRQ numbers as its parent domain. Only the XIVE (P9/P10) parent domain is supported for now. We still need to add support for IRQ domain hierarchy under XICS. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-6-clg@kaod.org --- arch/powerpc/include/asm/pci-bridge.h | 5 + arch/powerpc/kernel/pci-common.c | 6 + arch/powerpc/platforms/pseries/msi.c | 185 +++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/setup.c | 2 + 5 files changed, 199 insertions(+) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 74424c14515c..90f488fa4c17 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -126,6 +126,11 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ void *private_data; + + /* IRQ domain hierarchy */ + struct irq_domain *dev_domain; + struct irq_domain *msi_domain; + struct fwnode_handle *fwnode; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 001e90cd8948..c3573430919d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev) int pcibios_add_device(struct pci_dev *dev) { + struct irq_domain *d; + #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); #endif /* CONFIG_PCI_IOV */ + d = dev_get_msi_domain(&dev->bus->dev); + if (d) + dev_set_msi_domain(&dev->dev, d); return 0; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 4bf14f27e1aa..86c6809ebac2 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "pseries.h" @@ -518,6 +519,190 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) return 0; } +static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + + return rtas_prepare_msi_irqs(pdev, nvec, type, arg); +} + +static struct msi_domain_ops pseries_pci_msi_domain_ops = { + .msi_prepare = pseries_msi_ops_prepare, +}; + +static void pseries_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pseries_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pseries_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip pseries_pci_msi_irq_chip = { + .name = "pSeries-PCI-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = pseries_msi_mask, + .irq_unmask = pseries_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pseries_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pseries_pci_msi_domain_ops, + .chip = &pseries_pci_msi_irq_chip, +}; + +static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); +} + +static struct irq_chip pseries_msi_irq_chip = { + .name = "pSeries-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pseries_msi_compose_msg, +}; + +static int pseries_irq_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *phb = domain->host_data; + msi_alloc_info_t *info = arg; + struct msi_desc *desc = info->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); + int hwirq; + int i, ret; + + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_attrib.entry_nr); + if (hwirq < 0) { + dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); + return hwirq; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + phb->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pseries_irq_parent_domain_alloc(domain, virq + i, hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pseries_msi_irq_chip, domain->host_data); + } + + return 0; + +out: + /* TODO: handle RTAS cleanup in ->msi_finish() ? */ + irq_domain_free_irqs_parent(domain, virq, i - 1); + return ret; +} + +static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *phb = irq_data_get_irq_chip_data(d); + + pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops pseries_irq_domain_ops = { + .alloc = pseries_irq_domain_alloc, + .free = pseries_irq_domain_free, +}; + +static int __pseries_msi_allocate_domains(struct pci_controller *phb, + unsigned int count) +{ + struct irq_domain *parent = irq_get_default_host(); + + phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", + phb->global_number); + if (!phb->fwnode) + return -ENOMEM; + + phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + phb->fwnode, + &pseries_irq_domain_ops, phb); + if (!phb->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + return -ENOMEM; + } + + phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn), + &pseries_msi_domain_info, + phb->dev_domain); + if (!phb->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + irq_domain_remove(phb->dev_domain); + return -ENOMEM; + } + + return 0; +} + +int pseries_msi_allocate_domains(struct pci_controller *phb) +{ + int count; + + /* Only supported by the XIVE driver */ + if (!xive_enabled()) + return -ENODEV; + + if (!__find_pe_total_msi(phb->dn, &count)) { + pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + return -ENOSPC; + } + + return __pseries_msi_allocate_domains(phb, count); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 1f051a786fb3..d9280262588b 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -85,6 +85,7 @@ struct pci_host_bridge; int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); extern struct pci_controller_ops pseries_pci_controller_ops; +int pseries_msi_allocate_domains(struct pci_controller *phb); unsigned long pseries_memory_block_size(void); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 631a0d57b6cd..35724caf8a83 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -486,6 +486,8 @@ static void __init pSeries_discover_phbs(void) /* create pci_dn's for DT nodes under this PHB */ pci_devs_phb_init_dynamic(phb); + + pseries_msi_allocate_domains(phb); } of_node_put(root); From 5690bcae186084a8544b1819f0d89399268bd0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:24 +0200 Subject: [PATCH 101/474] powerpc/xive: Drop unmask of MSIs at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was a workaround in the XIVE domain because of the lack of MSI domain. This is now handled. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-7-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 823f9fe3542a..356f584cc51b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -616,16 +616,6 @@ static unsigned int xive_irq_startup(struct irq_data *d) pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - /* Pick a target */ target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); if (target == XIVE_INVALID_TARGET) { From 292145a6e598c1e6633b8f5f607706b46f552ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:25 +0200 Subject: [PATCH 102/474] powerpc/xive: Remove irqd_is_started() check when setting the affinity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the early days of XIVE support, commit cffb717ceb8e ("powerpc/xive: Ensure active irqd when setting affinity") tried to fix an issue related to interrupt migration. If the root cause was related to CPU unplug, it should have been fixed and there is no reason to keep the irqd_is_started() check. This test is also breaking affinity setting of MSIs which can set before starting the associated IRQ. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-8-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 356f584cc51b..0631c97b3a14 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -709,10 +709,6 @@ static int xive_irq_set_affinity(struct irq_data *d, if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) return -EINVAL; - /* Don't do anything if the interrupt isn't started */ - if (!irqd_is_started(d)) - return IRQ_SET_MASK_OK; - /* * If existing target is already in the new mask, and is * online then do nothing. From 07817a578a7a79638537480b8847dc7a12f293c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:26 +0200 Subject: [PATCH 103/474] powerpc/pseries/pci: Add a domain_free_irqs() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RTAS firmware can not disable one MSI at a time. It's all or nothing. We need a custom free IRQ handler for that. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-9-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 86c6809ebac2..591cee9cbc9e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -529,8 +529,24 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } +/* + * RTAS can not disable one MSI at a time. It's all or nothing. Do it + * at the end after all IRQs have been freed. + */ +static void pseries_msi_domain_free_irqs(struct irq_domain *domain, + struct device *dev) +{ + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + __msi_domain_free_irqs(domain, dev); + + rtas_disable_msi(to_pci_dev(dev)); +} + static struct msi_domain_ops pseries_pci_msi_domain_ops = { .msi_prepare = pseries_msi_ops_prepare, + .domain_free_irqs = pseries_msi_domain_free_irqs, }; static void pseries_msi_shutdown(struct irq_data *d) From 9a014f456881e947bf8cdd8c984a207097e6c096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:27 +0200 Subject: [PATCH 104/474] powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MSI domain clears the IRQ with msi_domain_free(), which calls irq_domain_free_irqs_top(), which clears the handler data. This is a problem for the XIVE controller since we need to unmap MMIO pages and free a specific XIVE structure. The 'msi_free()' handler is called before irq_domain_free_irqs_top() when the handler data is still available. Use that to clear the XIVE controller data. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-10-clg@kaod.org --- arch/powerpc/include/asm/xive.h | 1 + arch/powerpc/platforms/pseries/msi.c | 16 +++++++++++++++- arch/powerpc/sysdev/xive/common.c | 5 ++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index aa094a8655b0..20ae50ab083c 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); +void xive_irq_free_data(unsigned int virq); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 591cee9cbc9e..f9635b01b2bf 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -529,6 +529,19 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } +/* + * ->msi_free() is called before irq_domain_free_irqs_top() when the + * handler data is still available. Use that to clear the XIVE + * controller data. + */ +static void pseries_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + /* * RTAS can not disable one MSI at a time. It's all or nothing. Do it * at the end after all IRQs have been freed. @@ -546,6 +559,7 @@ static void pseries_msi_domain_free_irqs(struct irq_domain *domain, static struct msi_domain_ops pseries_pci_msi_domain_ops = { .msi_prepare = pseries_msi_ops_prepare, + .msi_free = pseries_msi_ops_msi_free, .domain_free_irqs = pseries_msi_domain_free_irqs, }; @@ -660,7 +674,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); - irq_domain_free_irqs_parent(domain, virq, nr_irqs); + /* XIVE domain data is cleared through ->msi_free() */ } static const struct irq_domain_ops pseries_irq_domain_ops = { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 0631c97b3a14..107f442d3411 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -975,6 +975,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { + pr_debug("%s for HW %x\n", __func__, xd->hw_irq); + if (xd->eoi_mmio) { iounmap(xd->eoi_mmio); if (xd->eoi_mmio == xd->trig_mmio) @@ -1016,7 +1018,7 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) return 0; } -static void xive_irq_free_data(unsigned int virq) +void xive_irq_free_data(unsigned int virq) { struct xive_irq_data *xd = irq_get_handler_data(virq); @@ -1026,6 +1028,7 @@ static void xive_irq_free_data(unsigned int virq) xive_cleanup_irq_data(xd); kfree(xd); } +EXPORT_SYMBOL_GPL(xive_irq_free_data); #ifdef CONFIG_SMP From 174db9e7f775ce06fc6949c9abbe758b3eb8171c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:28 +0200 Subject: [PATCH 105/474] powerpc/pseries/pci: Add support of MSI domains to PHB hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simply allocate or release the MSI domains when a PHB is inserted in or removed from the machine. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-11-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 10 ++++++++++ arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++++ arch/powerpc/platforms/pseries/pseries.h | 1 + 3 files changed, 15 insertions(+) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index f9635b01b2bf..e2127a3f7ebd 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -733,6 +733,16 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) return __pseries_msi_allocate_domains(phb, count); } +void pseries_msi_free_domains(struct pci_controller *phb) +{ + if (phb->msi_domain) + irq_domain_remove(phb->msi_domain); + if (phb->dev_domain) + irq_domain_remove(phb->dev_domain); + if (phb->fwnode) + irq_domain_free_fwnode(phb->fwnode); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index a8f9140a24fa..90c9d3531694 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + pseries_msi_allocate_domains(phb); + /* Create EEH devices for the PHB */ eeh_phb_pe_create(phb); @@ -74,6 +76,8 @@ int remove_phb_dynamic(struct pci_controller *phb) } } + pseries_msi_free_domains(phb); + /* Remove the PCI bus and unregister the bridge device from sysfs */ phb->bus = NULL; pci_remove_bus(b); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index d9280262588b..3544778e06d0 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -86,6 +86,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); extern struct pci_controller_ops pseries_pci_controller_ops; int pseries_msi_allocate_domains(struct pci_controller *phb); +void pseries_msi_free_domains(struct pci_controller *phb); unsigned long pseries_memory_block_size(void); From 2c50d7e99e39eba92b93210e740f3f9e5a06ba54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:29 +0200 Subject: [PATCH 106/474] powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will be used as a 'compose_msg' handler of the MSI domain introduced later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-12-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 28 +++++++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7de464679292..2922674cc934 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2016,15 +2016,17 @@ bool is_pnv_opal_msi(struct irq_chip *chip) } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) +static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int xive_num, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; __be32 data; int rc; + dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__, + is_64 ? "64" : "32", xive_num); + /* No PE assigned ? bail out ... no MSI for you ! */ if (pe == NULL) return -ENXIO; @@ -2072,12 +2074,28 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); + return 0; +} + +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg); + if (rc) + return rc; + + /* P8 only */ pnv_set_msi_irq_chip(phb, virq); pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," " address=%x_%08x data=%x PE# %x\n", pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, data, pe->pe_number); + msg->address_hi, msg->address_lo, msg->data, pe->pe_number); return 0; } From 0fcfe2247e75070361af2b6845030cada92cdbf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:30 +0200 Subject: [PATCH 107/474] powerpc/powernv/pci: Add MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is very similar to the MSI domains of the pSeries platform. The MSI allocator is directly handled under the Linux PHB in the in-the-middle "PNV-MSI" domain. Only the XIVE (P9/P10) parent domain is supported for now. Support for XICS will come later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-13-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 188 ++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2922674cc934..d2a17fcb6002 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -2100,6 +2101,189 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } +/* + * The msi_free() op is called before irq_domain_free_irqs_top() when + * the handler data is still available. Use that to clear the XIVE + * controller. + */ +static void pnv_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + +static struct msi_domain_ops pnv_pci_msi_domain_ops = { + .msi_free = pnv_msi_ops_msi_free, +}; + +static void pnv_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pnv_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pnv_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip pnv_pci_msi_irq_chip = { + .name = "PNV-PCI-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = pnv_msi_mask, + .irq_unmask = pnv_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pnv_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pnv_pci_msi_domain_ops, + .chip = &pnv_pci_msi_irq_chip, +}; + +static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(d); + struct pci_dev *pdev = msi_desc_to_pci_dev(entry); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, + entry->msi_attrib.is_64, msg); + if (rc) + dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", + entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); +} + +static struct irq_chip pnv_msi_irq_chip = { + .name = "PNV-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pnv_msi_compose_msg, +}; + +static int pnv_irq_parent_domain_alloc(struct irq_domain *domain, + unsigned int virq, int hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *hose = domain->host_data; + struct pnv_phb *phb = hose->private_data; + msi_alloc_info_t *info = arg; + struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc); + int hwirq; + int i, ret; + + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs); + if (hwirq < 0) { + dev_warn(&pdev->dev, "failed to find a free MSI\n"); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + hose->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pnv_irq_parent_domain_alloc(domain, virq + i, + phb->msi_base + hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pnv_msi_irq_chip, hose); + } + + return 0; + +out: + irq_domain_free_irqs_parent(domain, virq, i - 1); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); + return ret; +} + +static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn, + virq, d->hwirq, nr_irqs); + + msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); + /* XIVE domain is cleared through ->msi_free() */ +} + +static const struct irq_domain_ops pnv_irq_domain_ops = { + .alloc = pnv_irq_domain_alloc, + .free = pnv_irq_domain_free, +}; + +static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +{ + struct pnv_phb *phb = hose->private_data; + struct irq_domain *parent = irq_get_default_host(); + + hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id); + if (!hose->fwnode) + return -ENOMEM; + + hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + hose->fwnode, + &pnv_irq_domain_ops, hose); + if (!hose->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + return -ENOMEM; + } + + hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn), + &pnv_msi_domain_info, + hose->dev_domain); + if (!hose->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + irq_domain_remove(hose->dev_domain); + return -ENOMEM; + } + + return 0; +} + static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { unsigned int count; @@ -2124,6 +2308,10 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); + + /* Only supported by the XIVE driver */ + if (xive_enabled()) + pnv_msi_allocate_domains(phb->hose, count); } static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, From ba418a0278265ad65f2f9544e743b7dbff3b994b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:31 +0200 Subject: [PATCH 108/474] KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passthrough PCI MSI interrupts are detected in KVM with a check on a specific EOI handler (P8) or on XIVE (P9). We can now check the PCI-MSI IRQ chip which is cleaner. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-14-clg@kaod.org --- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9f957ceee58a..1a757c3d33f7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5355,7 +5355,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { + if (!chip || !is_pnv_opal_msi(chip)) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d2a17fcb6002..e77caa4dbbdf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2007,13 +2007,15 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } +static struct irq_chip pnv_pci_msi_irq_chip; + /* * Returns true iff chip is something that we could call * pnv_opal_pci_msi_eoi for. */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi; + return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); From e5e78b15113a73d0294141d9796969fa7b10fa3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:32 +0200 Subject: [PATCH 109/474] KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt routines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routine kvmppc_set_passthru_irq() calls kvmppc_xive_set_mapped() and kvmppc_xive_clr_mapped() with an IRQ descriptor. Use directly the host IRQ number to remove a useless conversion. Add some debug. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-15-clg@kaod.org --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_xive.c | 17 ++++++++--------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2d88944f9f34..671fbd1a765e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -664,9 +664,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1a757c3d33f7..05b3a3548c18 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5398,7 +5398,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) pimap->n_mapped++; if (xics_on_xive()) - rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); if (rc) @@ -5439,7 +5439,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) } if (xics_on_xive()) - rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 8cfab3547494..45a0434e9d85 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -922,13 +922,12 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) } int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_desc_get_irq_data(host_desc); - unsigned int host_irq = irq_desc_get_irq(host_desc); + struct irq_data *host_data = irq_get_irq_data(host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; @@ -937,7 +936,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n", + __func__, guest_irq, host_irq, hw_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -959,7 +959,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, */ rc = irq_set_vcpu_affinity(host_irq, state); if (rc) { - pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq); return rc; } @@ -1019,12 +1019,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - unsigned int host_irq = irq_desc_get_irq(host_desc); u16 idx; u8 prio; int rc; @@ -1032,7 +1031,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("clr_mapped girq 0x%lx...\n", guest_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -1059,7 +1058,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, /* Release the passed-through interrupt to the host */ rc = irq_set_vcpu_affinity(host_irq, NULL); if (rc) { - pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq); return rc; } From 51be9e51a8000ffc6a33083ceca9da9303ed4dc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:33 +0200 Subject: [PATCH 110/474] KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the underlying calls handling the passthrough of the interrupt in the guest need a number in the XIVE IRQ domain. Use the IRQ data mapped in the XIVE IRQ domain and not the one in the PCI-MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-16-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 3 ++- kernel/irq/irqdomain.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 45a0434e9d85..6878026ee8ec 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -927,7 +927,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_get_irq_data(host_irq); + struct irq_data *host_data = + irq_domain_get_irq_data(irq_get_default_host(), host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..0eee4816edac 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void) { return irq_default_domain; } +EXPORT_SYMBOL_GPL(irq_get_default_host); static bool irq_domain_is_nomap(struct irq_domain *domain) { From 298f6f952885eeb1f25461f085c6c238bcd9fc5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:34 +0200 Subject: [PATCH 111/474] powerpc/xics: Remove ICS list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We always had only one ICS per machine. Simplify the XICS driver by removing the ICS list. The ICS stored in the chip data of the XICS domain becomes useless and we don't need it anymore to migrate away IRQs from a CPU. This will be removed in a subsequent patch. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-17-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 47 +++++++++++--------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index b14c502e56a8..08c25748efb9 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr); struct irq_domain *xics_host; -static LIST_HEAD(ics_list); +static struct ics *xics_ics; void xics_update_irq_servers(void) { @@ -111,12 +111,11 @@ void xics_setup_cpu(void) void xics_mask_unknown_vec(unsigned int vec) { - struct ics *ics; - pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec); - list_for_each_entry(ics, &ics_list, link) - ics->mask_unknown(ics, vec); + if (WARN_ON(!xics_ics)) + return; + xics_ics->mask_unknown(xics_ics, vec); } @@ -198,7 +197,6 @@ void xics_migrate_irqs_away(void) struct irq_chip *chip; long server; unsigned long flags; - struct ics *ics; /* We can't set affinity on ISA interrupts */ if (virq < NR_IRQS_LEGACY) @@ -219,13 +217,10 @@ void xics_migrate_irqs_away(void) raw_spin_lock_irqsave(&desc->lock, flags); /* Locate interrupt server */ - server = -1; - ics = irq_desc_get_chip_data(desc); - if (ics) - server = ics->get_server(ics, irq); + server = xics_ics->get_server(xics_ics, irq); if (server < 0) { - printk(KERN_ERR "%s: Can't find server for irq %d\n", - __func__, irq); + pr_err("%s: Can't find server for irq %d/%x\n", + __func__, virq, irq); goto unlock; } @@ -307,13 +302,9 @@ int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask, static int xics_host_match(struct irq_domain *h, struct device_node *node, enum irq_domain_bus_token bus_token) { - struct ics *ics; - - list_for_each_entry(ics, &ics_list, link) - if (ics->host_match(ics, node)) - return 1; - - return 0; + if (WARN_ON(!xics_ics)) + return 0; + return xics_ics->host_match(xics_ics, node) ? 1 : 0; } /* Dummies */ @@ -330,8 +321,6 @@ static struct irq_chip xics_ipi_chip = { static int xics_host_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { - struct ics *ics; - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); /* @@ -348,12 +337,14 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, return 0; } - /* Let the ICS setup the chip data */ - list_for_each_entry(ics, &ics_list, link) - if (ics->map(ics, virq) == 0) - return 0; + if (WARN_ON(!xics_ics)) + return -EINVAL; - return -EINVAL; + /* Let the ICS setup the chip data */ + if (xics_ics->map(xics_ics, virq)) + return -EINVAL; + + return 0; } static int xics_host_xlate(struct irq_domain *h, struct device_node *ct, @@ -427,7 +418,9 @@ static void __init xics_init_host(void) void __init xics_register_ics(struct ics *ics) { - list_add(&ics->link, &ics_list); + if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined !")) + return; + xics_ics = ics; } static void __init xics_get_server_size(void) From 248af248a8f45461662fb633eca4adf24ae704ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:35 +0200 Subject: [PATCH 112/474] powerpc/xics: Rename the map handler in a check handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the IRQ initialization done under the different ICS backends in the common part of XICS. The 'map' handler becomes a simple 'check' on the HW IRQ at the FW level. As we don't need an ICS anymore in xics_migrate_irqs_away(), the XICS domain does not set a chip data for the IRQ. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-18-clg@kaod.org --- arch/powerpc/include/asm/xics.h | 3 ++- arch/powerpc/sysdev/xics/ics-native.c | 13 +++++------- arch/powerpc/sysdev/xics/ics-opal.c | 27 +++++++++---------------- arch/powerpc/sysdev/xics/ics-rtas.c | 28 +++++++++----------------- arch/powerpc/sysdev/xics/xics-common.c | 15 ++++++++------ 5 files changed, 36 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index d9cf192368ad..0ac9bfddf704 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -89,10 +89,11 @@ static inline int ics_opal_init(void) { return -ENODEV; } /* ICS instance, hooked up to chip_data of an irq */ struct ics { struct list_head link; - int (*map)(struct ics *ics, unsigned int virq); + int (*check)(struct ics *ics, unsigned int hwirq); void (*mask_unknown)(struct ics *ics, unsigned long vec); long (*get_server)(struct ics *ics, unsigned long vec); int (*host_match)(struct ics *ics, struct device_node *node); + struct irq_chip *chip; char data[]; }; diff --git a/arch/powerpc/sysdev/xics/ics-native.c b/arch/powerpc/sysdev/xics/ics-native.c index d450502f4053..dec7d93a8ba1 100644 --- a/arch/powerpc/sysdev/xics/ics-native.c +++ b/arch/powerpc/sysdev/xics/ics-native.c @@ -131,19 +131,15 @@ static struct irq_chip ics_native_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_native_map(struct ics *ics, unsigned int virq) +static int ics_native_check(struct ics *ics, unsigned int hw_irq) { - unsigned int vec = (unsigned int)virq_to_hw(virq); struct ics_native *in = to_ics_native(ics); - pr_devel("%s: vec=0x%x\n", __func__, vec); + pr_devel("%s: hw_irq=0x%x\n", __func__, hw_irq); - if (vec < in->ibase || vec >= (in->ibase + in->icount)) + if (hw_irq < in->ibase || hw_irq >= (in->ibase + in->icount)) return -EINVAL; - irq_set_chip_and_handler(virq, &ics_native_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, ics); - return 0; } @@ -177,10 +173,11 @@ static int ics_native_host_match(struct ics *ics, struct device_node *node) } static struct ics ics_native_template = { - .map = ics_native_map, + .check = ics_native_check, .mask_unknown = ics_native_mask_unknown, .get_server = ics_native_get_server, .host_match = ics_native_host_match, + .chip = &ics_native_irq_chip, }; static int __init ics_native_add_one(struct device_node *np) diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 823f6c9664cd..8c7ddcc718b6 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -157,26 +157,13 @@ static struct irq_chip ics_opal_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_opal_map(struct ics *ics, unsigned int virq); -static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_opal_get_server(struct ics *ics, unsigned long vec); - static int ics_opal_host_match(struct ics *ics, struct device_node *node) { return 1; } -/* Only one global & state struct ics */ -static struct ics ics_hal = { - .map = ics_opal_map, - .mask_unknown = ics_opal_mask_unknown, - .get_server = ics_opal_get_server, - .host_match = ics_opal_host_match, -}; - -static int ics_opal_map(struct ics *ics, unsigned int virq) +static int ics_opal_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int64_t rc; __be16 server; int8_t priority; @@ -189,9 +176,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq) if (rc != OPAL_SUCCESS) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_hal); - return 0; } @@ -222,6 +206,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned long vec) return ics_opal_unmangle_server(be16_to_cpu(server)); } +/* Only one global & state struct ics */ +static struct ics ics_hal = { + .check = ics_opal_check, + .mask_unknown = ics_opal_mask_unknown, + .get_server = ics_opal_get_server, + .host_match = ics_opal_host_match, + .chip = &ics_opal_irq_chip, +}; + int __init ics_opal_init(void) { if (!firmware_has_feature(FW_FEATURE_OPAL)) diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 4cf18000f07c..6d19d711ed35 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -24,19 +24,6 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; -static int ics_rtas_map(struct ics *ics, unsigned int virq); -static void ics_rtas_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_rtas_get_server(struct ics *ics, unsigned long vec); -static int ics_rtas_host_match(struct ics *ics, struct device_node *node); - -/* Only one global & state struct ics */ -static struct ics ics_rtas = { - .map = ics_rtas_map, - .mask_unknown = ics_rtas_mask_unknown, - .get_server = ics_rtas_get_server, - .host_match = ics_rtas_host_match, -}; - static void ics_rtas_unmask_irq(struct irq_data *d) { unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); @@ -169,9 +156,8 @@ static struct irq_chip ics_rtas_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_rtas_map(struct ics *ics, unsigned int virq) +static int ics_rtas_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int status[2]; int rc; @@ -183,9 +169,6 @@ static int ics_rtas_map(struct ics *ics, unsigned int virq) if (rc) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_rtas_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_rtas); - return 0; } @@ -213,6 +196,15 @@ static int ics_rtas_host_match(struct ics *ics, struct device_node *node) return !of_device_is_compatible(node, "chrp,iic"); } +/* Only one global & state struct ics */ +static struct ics ics_rtas = { + .check = ics_rtas_check, + .mask_unknown = ics_rtas_mask_unknown, + .get_server = ics_rtas_get_server, + .host_match = ics_rtas_host_match, + .chip = &ics_rtas_irq_chip, +}; + __init int ics_rtas_init(void) { ibm_get_xive = rtas_token("ibm,get-xive"); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 08c25748efb9..ed2bc14f56bd 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -318,10 +318,10 @@ static struct irq_chip xics_ipi_chip = { .irq_unmask = xics_ipi_unmask, }; -static int xics_host_map(struct irq_domain *h, unsigned int virq, - irq_hw_number_t hw) +static int xics_host_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); + pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hwirq); /* * Mark interrupts as edge sensitive by default so that resend @@ -331,7 +331,7 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, irq_clear_status_flags(virq, IRQ_LEVEL); /* Don't call into ICS for IPIs */ - if (hw == XICS_IPI) { + if (hwirq == XICS_IPI) { irq_set_chip_and_handler(virq, &xics_ipi_chip, handle_percpu_irq); return 0; @@ -340,10 +340,13 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, if (WARN_ON(!xics_ics)) return -EINVAL; - /* Let the ICS setup the chip data */ - if (xics_ics->map(xics_ics, virq)) + if (xics_ics->check(xics_ics, hwirq)) return -EINVAL; + /* No chip data for the XICS domain */ + irq_domain_set_info(domain, virq, hwirq, xics_ics->chip, + NULL, handle_fasteoi_irq, NULL, NULL); + return 0; } From 7d14f6c60b76fa7f3f89d81a95385576ca33b483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:36 +0200 Subject: [PATCH 113/474] powerpc/xics: Give a name to the default XICS IRQ domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and clean up the error path. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-19-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index ed2bc14f56bd..18d3de2f2249 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -412,11 +412,22 @@ static const struct irq_domain_ops xics_host_ops = { .xlate = xics_host_xlate, }; -static void __init xics_init_host(void) +static int __init xics_allocate_domain(void) { - xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL); - BUG_ON(xics_host == NULL); + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("XICS"); + if (!fn) + return -ENOMEM; + + xics_host = irq_domain_create_tree(fn, &xics_host_ops, NULL); + if (!xics_host) { + irq_domain_free_fwnode(fn); + return -ENOMEM; + } + irq_set_default_host(xics_host); + return 0; } void __init xics_register_ics(struct ics *ics) @@ -480,6 +491,8 @@ void __init xics_init(void) /* Initialize common bits */ xics_get_server_size(); xics_update_irq_servers(); - xics_init_host(); + rc = xics_allocate_domain(); + if (rc < 0) + pr_err("XICS: Failed to create IRQ domain"); xics_setup_cpu(); } From 53b34e8db73af98fa652641bf490384dc665d0f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:37 +0200 Subject: [PATCH 114/474] powerpc/xics: Add debug logging to the set_irq_affinity handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It really helps to know how the HW is configured when tweaking the IRQ subsystem. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-20-clg@kaod.org --- arch/powerpc/sysdev/xics/ics-opal.c | 2 +- arch/powerpc/sysdev/xics/ics-rtas.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 8c7ddcc718b6..bf26cae1b982 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -133,7 +133,7 @@ static int ics_opal_set_affinity(struct irq_data *d, } server = ics_opal_mangle_server(wanted_server); - pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", + pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", d->irq, hw_irq, wanted_server, server); rc = opal_set_xive(hw_irq, server, priority); diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 6d19d711ed35..b50c6341682e 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -133,6 +133,9 @@ static int ics_rtas_set_affinity(struct irq_data *d, return -1; } + pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq, + hw_irq, irq_server); + status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, hw_irq, irq_server, xics_status[1]); From e4f0aa3b4731430ad73fb4485e97f751c7500668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:38 +0200 Subject: [PATCH 115/474] powerpc/xics: Add support for IRQ domain hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XICS doesn't have any state associated with the IRQ. The support is straightforward and simpler than for XIVE. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-21-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 41 ++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 18d3de2f2249..febab57f060f 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -406,7 +406,48 @@ int xics_retrigger(struct irq_data *data) return 0; } +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *hwirq, unsigned int *type) +{ + return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param, + fwspec->param_count, hwirq, type); +} + +static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip, + xics_ics, handle_fasteoi_irq, NULL, NULL); + + return 0; +} + +static void xics_host_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); +} +#endif + static const struct irq_domain_ops xics_host_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xics_host_domain_alloc, + .free = xics_host_domain_free, + .translate = xics_host_domain_translate, +#endif .match = xics_host_match, .map = xics_host_map, .xlate = xics_host_xlate, From bbb25af8fbdba4acaf955e412a84eb2eea48697c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:39 +0200 Subject: [PATCH 116/474] powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PHB3s need an extra OPAL call to EOI the interrupt. The call takes an OPAL HW IRQ number but it is translated into a vector number in OPAL. Here, we directly use the vector number of the in-the-middle "PNV-MSI" domain instead of grabbing the OPAL HW IRQ number in the XICS parent domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-22-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e77caa4dbbdf..b498876a976f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2169,12 +2169,33 @@ static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); } +/* + * The IRQ data is mapped in the MSI domain in which HW IRQ numbers + * correspond to vector numbers. + */ +static void pnv_msi_eoi(struct irq_data *d) +{ + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_PHB3) { + /* + * The EOI OPAL call takes an OPAL HW IRQ number but + * since it is translated into a vector number in + * OPAL, use that directly. + */ + WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq)); + } + + irq_chip_eoi_parent(d); +} + static struct irq_chip pnv_msi_irq_chip = { .name = "PNV-MSI", .irq_shutdown = pnv_msi_shutdown, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, - .irq_eoi = irq_chip_eoi_parent, + .irq_eoi = pnv_msi_eoi, .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = pnv_msi_compose_msg, }; From 679e30b9536eeb93bc8c9a39c0ddc77dec536f6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:40 +0200 Subject: [PATCH 117/474] powerpc/pci: Drop XIVE restriction on MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PowerNV and pSeries platforms now have support for both the XICS and XIVE IRQ domains. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-23-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +--- arch/powerpc/platforms/pseries/msi.c | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b498876a976f..e2454439e574 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2332,9 +2332,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); - /* Only supported by the XIVE driver */ - if (xive_enabled()) - pnv_msi_allocate_domains(phb->hose, count); + pnv_msi_allocate_domains(phb->hose, count); } static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index e2127a3f7ebd..e196cc1b8540 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -720,10 +720,6 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) { int count; - /* Only supported by the XIVE driver */ - if (!xive_enabled()) - return -ENODEV; - if (!__find_pe_total_msi(phb->dn, &count)) { pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", phb->dn, phb->global_number); From 1e661f81a522eadfe4bc5bb1ec9fbae27c13f163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:41 +0200 Subject: [PATCH 118/474] powerpc/xics: Drop unmask of MSIs at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was a workaround in the XICS domain because of the lack of MSI domain. This is now handled. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-24-clg@kaod.org --- arch/powerpc/sysdev/xics/ics-opal.c | 11 ----------- arch/powerpc/sysdev/xics/ics-rtas.c | 9 --------- 2 files changed, 20 deletions(-) diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index bf26cae1b982..c4d95d8beb6f 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d) static unsigned int ics_opal_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - - /* unmask it */ ics_opal_unmask_irq(d); return 0; } diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index b50c6341682e..b9da317b7a2d 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -57,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d) static unsigned int ics_rtas_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif /* unmask it */ ics_rtas_unmask_irq(d); return 0; From 3005123eea0daa18d98602ab64b2ce3ad087d849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:42 +0200 Subject: [PATCH 119/474] powerpc/pseries/pci: Drop unused MSI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSIs should be fully managed by the PCI and IRQ subsystems now. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-25-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 87 ---------------------------- 1 file changed, 87 deletions(-) diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index e196cc1b8540..1b305e411862 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -111,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int offset) return rtas_ret[0]; } -static void rtas_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct msi_desc *entry; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - } - - rtas_disable_msi(pdev); -} - static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; @@ -459,66 +444,6 @@ again: return 0; } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) -{ - struct pci_dn *pdn; - int hwirq, virq, i; - int rc; - struct msi_desc *entry; - struct msi_msg msg; - - rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL); - if (rc) - return rc; - - pdn = pci_get_pdn(pdev); - i = 0; - for_each_pci_msi_entry(entry, pdev) { - hwirq = rtas_query_irq_number(pdn, i++); - if (hwirq < 0) { - pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); - return hwirq; - } - - /* - * Depending on the number of online CPUs in the original - * kernel, it is likely for CPU #0 to be offline in a kdump - * kernel. The associated IRQs in the affinity mappings - * provided by irq_create_affinity_masks() are thus not - * started by irq_startup(), as per-design for managed IRQs. - * This can be a problem with multi-queue block devices driven - * by blk-mq : such a non-started IRQ is very likely paired - * with the single queue enforced by blk-mq during kdump (see - * blk_mq_alloc_tag_set()). This causes the device to remain - * silent and likely hangs the guest at some point. - * - * We don't really care for fine-grained affinity when doing - * kdump actually : simply ignore the pre-computed affinity - * masks in this case and let the default mask with all CPUs - * be used when creating the IRQ mappings. - */ - if (is_kdump_kernel()) - virq = irq_create_mapping(NULL, hwirq); - else - virq = irq_create_mapping_affinity(NULL, hwirq, - entry->affinity); - - if (!virq) { - pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); - return -ENOSPC; - } - - dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - irq_set_msi_desc(virq, entry); - - /* Read config space back so we can restore after reset */ - __pci_read_msi_msg(entry, &msg); - entry->msg = msg; - } - - return 0; -} - static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { @@ -759,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { - struct pci_controller *phb; - query_token = rtas_token("ibm,query-interrupt-source-number"); change_token = rtas_token("ibm,change-msi"); @@ -772,16 +695,6 @@ static int rtas_msi_init(void) pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); - WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); - pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - - list_for_each_entry(phb, &hose_list, list_node) { - WARN_ON(phb->controller_ops.setup_msi_irqs); - phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - } - WARN_ON(ppc_md.pci_irq_fixup); ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; From 6d9ba6121b1cf453985d08c141970a1b44cd9cf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:43 +0200 Subject: [PATCH 120/474] powerpc/powernv/pci: Drop unused MSI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSIs should be fully managed by the PCI and IRQ subsystems now. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-26-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 27 --------- arch/powerpc/platforms/powernv/pci.c | 67 ----------------------- arch/powerpc/platforms/powernv/pci.h | 6 -- 3 files changed, 100 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e2454439e574..eb38ce1fd434 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2080,29 +2080,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) -{ - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; - int rc; - - rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg); - if (rc) - return rc; - - /* P8 only */ - pnv_set_msi_irq_chip(phb, virq); - - pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," - " address=%x_%08x data=%x PE# %x\n", - pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, msg->data, pe->pe_number); - - return 0; -} - /* * The msi_free() op is called before irq_domain_free_irqs_top() when * the handler data is still available. Use that to clear the XIVE @@ -2327,8 +2304,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; } - phb->msi_setup = pnv_pci_ioda_msi_setup; - phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); @@ -2936,8 +2911,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6bb3c52633fb..9a8391b983d1 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -160,73 +160,6 @@ exit: } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - struct msi_msg msg; - int hwirq; - unsigned int virq; - int rc; - - if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) - return -ENODEV; - - if (pdev->no_64bit_msi && !phb->msi32_support) - return -ENODEV; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->msi_attrib.is_64 && !phb->msi32_support) { - pr_warn("%s: Supports only 64-bit MSIs\n", - pci_name(pdev)); - return -ENXIO; - } - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); - if (hwirq < 0) { - pr_warn("%s: Failed to find a free MSI\n", - pci_name(pdev)); - return -ENOSPC; - } - virq = irq_create_mapping(NULL, phb->msi_base + hwirq); - if (!virq) { - pr_warn("%s: Failed to map MSI to linux irq\n", - pci_name(pdev)); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return -ENOMEM; - } - rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, - virq, entry->msi_attrib.is_64, &msg); - if (rc) { - pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); - irq_dispose_mapping(virq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return rc; - } - irq_set_msi_desc(virq, entry); - pci_write_msi_msg(virq, &msg); - } - return 0; -} - -void pnv_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - irq_hw_number_t hwirq; - - if (WARN_ON(!phb)) - return; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - hwirq = virq_to_hw(entry->irq); - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1); - } -} - /* Nicely print the contents of the PE State Tables (PEST). */ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index c8d4f222a86f..966a9eb64339 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -123,11 +123,7 @@ struct pnv_phb { #endif unsigned int msi_base; - unsigned int msi32_support; struct msi_bitmap msi_bmp; - int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); @@ -289,8 +285,6 @@ extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); -extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); From f1a377f86f51b381cfc30bf2270f8a5f81e35ee9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:44 +0200 Subject: [PATCH 121/474] powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pnv_ioda2_msi_eoi() chip handler is not used anymore for MSIs. Simply use the check on the PSI-MSI chip. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-27-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index eb38ce1fd434..6c4b37598bcc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2015,7 +2015,7 @@ static struct irq_chip pnv_pci_msi_irq_chip; */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == &pnv_pci_msi_irq_chip; + return chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); From c80198a21792ac59412871e4e6fad5041c9be8e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:45 +0200 Subject: [PATCH 122/474] powerpc/xics: Fix IRQ migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit desc->irq_data points to the top level IRQ data descriptor which is not necessarily in the XICS IRQ domain. MSIs are in another domain for instance. Fix that by looking for a mapping on the low level XICS IRQ domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-28-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index febab57f060f..4a7687caec75 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -183,6 +183,8 @@ void xics_migrate_irqs_away(void) unsigned int irq, virq; struct irq_desc *desc; + pr_debug("%s: CPU %u\n", __func__, cpu); + /* If we used to be the default server, move to the new "boot_cpuid" */ if (hw_cpu == xics_default_server) xics_update_irq_servers(); @@ -197,6 +199,7 @@ void xics_migrate_irqs_away(void) struct irq_chip *chip; long server; unsigned long flags; + struct irq_data *irqd; /* We can't set affinity on ISA interrupts */ if (virq < NR_IRQS_LEGACY) @@ -204,9 +207,11 @@ void xics_migrate_irqs_away(void) /* We only need to migrate enabled IRQS */ if (!desc->action) continue; - if (desc->irq_data.domain != xics_host) + /* We need a mapping in the XICS IRQ domain */ + irqd = irq_domain_get_irq_data(xics_host, virq); + if (!irqd) continue; - irq = desc->irq_data.hwirq; + irq = irqd_to_hwirq(irqd); /* We need to get IPIs still. */ if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; From 5cd69651ceeed15e021cf7d19f1b1be0a80c0c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:46 +0200 Subject: [PATCH 123/474] powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before MSI domains, the default IRQ chip of PHB3 MSIs was patched by pnv_set_msi_irq_chip() with the custom EOI handler pnv_ioda2_msi_eoi() and the owning PHB was deduced from the 'ioda.irq_chip' field. This path has been deprecated by the MSI domains but it is still in use by the P8 CAPI 'cxl' driver. Rewriting this driver to support MSI would be a waste of time. Nevertheless, we can still remove the IRQ chip patch and set the IRQ chip data instead. This is cleaner. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-29-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6c4b37598bcc..aa97245eedbf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1971,19 +1971,23 @@ int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) return opal_pci_msi_eoi(phb->opal_id, hw_irq); } +/* + * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers + */ static void pnv_ioda2_msi_eoi(struct irq_data *d) { int64_t rc; unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; - rc = pnv_opal_pci_msi_eoi(chip, hw_irq); + rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); } - +/* P8/CXL only */ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) { struct irq_data *idata; @@ -2005,6 +2009,7 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; } irq_set_chip(virq, &phb->ioda.irq_chip); + irq_set_chip_data(virq, phb->hose); } static struct irq_chip pnv_pci_msi_irq_chip; From c325712b5f85e561ea89bae2ba5d0104e797e42c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:47 +0200 Subject: [PATCH 124/474] powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pnv_opal_pci_msi_eoi() is called from KVM to EOI passthrough interrupts when in real mode. Adding MSI domain broke the hack using the 'ioda.irq_chip' field to deduce the owning PHB. Fix that by using the IRQ chip data in the MSI domain. The 'ioda.irq_chip' field is now unused and could be removed from the pnv_phb struct. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-30-clg@kaod.org --- arch/powerpc/include/asm/pnv-pci.h | 2 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 8 ++++---- arch/powerpc/platforms/powernv/pci-ioda.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index d0ee0ede5767..b3f480799352 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d); bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 0a11ec88a0ae..587c33fc4564 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -706,6 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) icp->rm_eoied_irq = irq; } + /* Handle passthrough interrupts */ if (state->host_irq) { ++vcpu->stat.pthru_all; if (state->intr_cpu != -1) { @@ -759,12 +760,12 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) static unsigned long eoi_rc; -static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) +static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again) { void __iomem *xics_phys; int64_t rc; - rc = pnv_opal_pci_msi_eoi(c, hwirq); + rc = pnv_opal_pci_msi_eoi(d); if (rc) eoi_rc = rc; @@ -872,8 +873,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ - icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, - again); + icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, again); if (check_too_hard(xics, icp) == H_TOO_HARD) return 2; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index aa97245eedbf..2389cd79c3c8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1963,12 +1963,21 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->dma_setup_done = true; } -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) +/* + * Called from KVM in real mode to EOI passthru interrupts. The ICP + * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru(). + * + * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call + * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ + * numbers of the in-the-middle MSI domain are vector numbers and it's + * good enough for OPAL. Use that. + */ +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d) { - struct pnv_phb *phb = container_of(chip, struct pnv_phb, - ioda.irq_chip); + struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data); + struct pnv_phb *phb = hose->private_data; - return opal_pci_msi_eoi(phb->opal_id, hw_irq); + return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq); } /* From 1753081f2d445f9157550692fcc4221cd3ff0958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:48 +0200 Subject: [PATCH 125/474] KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSIs now live in an MSI domain but the underlying calls, which will EOI the interrupt in real mode, need an HW IRQ number mapped in the XICS IRQ domain. Grab it there. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-31-clg@kaod.org --- arch/powerpc/kvm/book3s_hv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 05b3a3548c18..f28f99805c4c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5328,6 +5328,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; int i, rc = 0; + struct irq_data *host_data; if (!kvm_irq_bypass) return 1; @@ -5392,7 +5393,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * the KVM real mode handler. */ smp_wmb(); - irq_map->r_hwirq = desc->irq_data.hwirq; + + /* + * The 'host_irq' number is mapped in the PCI-MSI domain but + * the underlying calls, which will EOI the interrupt in real + * mode, need an HW IRQ number mapped in the XICS IRQ domain. + */ + host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq); + irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data); if (i == pimap->n_mapped) pimap->n_mapped++; @@ -5400,7 +5408,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (xics_on_xive()) rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq); if (rc) irq_map->r_hwirq = 0; From 59b2bc18b1492b46d45b6b6828ba098f09b9ba67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 1 Jul 2021 15:27:49 +0200 Subject: [PATCH 126/474] powerpc/xive: Use XIVE domain under xmon and debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default domain of the PCI/MSIs is not the XIVE domain anymore. To list the IRQ mappings under XMON and debugfs, query the IRQ data from the low level XIVE domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-32-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 107f442d3411..d0deaebbfeeb 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -312,11 +312,10 @@ void xmon_xive_get_irq_all(void) struct irq_desc *desc; for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); - unsigned int hwirq = (unsigned int)irqd_to_hwirq(d); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) - xmon_xive_get_irq_config(hwirq, d); + if (d) + xmon_xive_get_irq_config(irqd_to_hwirq(d), d); } } @@ -1757,9 +1756,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private) xive_debug_show_cpu(m, cpu); for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) + if (d) xive_debug_show_irq(m, d); } return 0; From 17df41fec5b80b16ea4774495f1eb730e2225619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 19 Jul 2021 15:06:14 +0200 Subject: [PATCH 127/474] powerpc: use IRQF_NO_DEBUG for IPIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to use the lockup detector ("noirqdebug") for IPIs. The ipistorm benchmark measures a ~10% improvement on high systems when this flag is set. Signed-off-by: Cédric Le Goater Reviewed-by: Thomas Gleixner Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210719130614.195886-1-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 2 +- arch/powerpc/sysdev/xive/common.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 4a7687caec75..5c1a157a83b8 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -132,7 +132,7 @@ static void xics_request_ipi(void) * IPIs are marked IRQF_PERCPU. The handler was set in map. */ BUG_ON(request_irq(ipi, icp_ops->ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); } void __init xics_smp_probe(void) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index d0deaebbfeeb..4018964bbd69 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1149,7 +1149,8 @@ static int __init xive_request_ipi(void) snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); ret = request_irq(xid->irq, xive_muxed_ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL); + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, + xid->name, NULL); WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); } From b68c6646cce5ee8caefa6333ee743f960222dcea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 20 Jul 2021 15:42:08 +0200 Subject: [PATCH 128/474] KVM: PPC: Book3S HV: XIVE: Add a 'flags' field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use it to hold platform specific features. P9 DD2 introduced single-escalation support. P10 will add others. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720134209.256133-2-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 19 ++++++++++--------- arch/powerpc/kvm/book3s_xive.h | 9 ++++++++- arch/powerpc/kvm/book3s_xive_native.c | 12 +++++++----- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6878026ee8ec..555cc610e7ab 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -363,9 +363,9 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0 && !xive->single_escalation) + if (rc == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation(vcpu, prio, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); if (rc) return rc; } @@ -1199,7 +1199,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -1340,7 +1340,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (r) { pr_err("Failed to enable VP in OPAL, err %d\n", r); goto bail; @@ -1357,15 +1357,15 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct xive_q *q = &xc->queues[i]; /* Single escalation, no queue 7 */ - if (i == 7 && xive->single_escalation) + if (i == 7 && kvmppc_xive_has_single_escalation(xive)) break; /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0 && !xive->single_escalation) + if (r == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation( - vcpu, i, xive->single_escalation); + vcpu, i, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; } else { @@ -1380,7 +1380,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } /* If not done above, attach priority 0 escalation */ - r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation); + r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; @@ -2135,7 +2135,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; kvm->arch.xive = xive; return 0; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index afe9eeac6d56..73c3cd25093c 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -97,6 +97,8 @@ struct kvmppc_xive_ops { int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq); }; +#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1 + struct kvmppc_xive { struct kvm *kvm; struct kvm_device *dev; @@ -133,7 +135,7 @@ struct kvmppc_xive { u32 q_page_order; /* Flags */ - u8 single_escalation; + u8 flags; /* Number of entries in the VP block */ u32 nr_servers; @@ -308,5 +310,10 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); +static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive) +{ + return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; +} + #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 573ecaab3597..2abb1358a268 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -93,7 +93,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -172,7 +172,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (rc) { pr_err("Failed to enable VP in OPAL: %d\n", rc); goto bail; @@ -693,7 +693,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, } rc = kvmppc_xive_attach_escalation(vcpu, priority, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); error: if (rc) kvmppc_xive_native_cleanup_queue(vcpu, priority); @@ -820,7 +820,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { /* Single escalation, no queue 7 */ - if (prio == 7 && xive->single_escalation) + if (prio == 7 && kvmppc_xive_has_single_escalation(xive)) break; if (xc->esc_virq[prio]) { @@ -1111,7 +1111,9 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + xive->ops = &kvmppc_xive_native_ops; kvm->arch.xive = xive; From f5af0a978776b710f16dc99a85496b1e760bf9e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 20 Jul 2021 15:42:09 +0200 Subject: [PATCH 129/474] KVM: PPC: Book3S HV: XIVE: Add support for automatic save-restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On P10, the feature doing an automatic "save & restore" of a VCPU interrupt context is set by default in OPAL. When a VP context is pulled out, the state of the interrupt registers are saved by the XIVE interrupt controller under the internal NVP structure representing the VP. This saves a costly store/load in guest entries and exits. If OPAL advertises the "save & restore" feature in the device tree, it should also have set the 'H' bit in the CAM line. Check that when vCPUs are connected to their ICP in KVM before going any further. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720134209.256133-3-clg@kaod.org --- arch/powerpc/include/asm/xive-regs.h | 3 +++ arch/powerpc/include/asm/xive.h | 1 + arch/powerpc/kvm/book3s_xive.c | 34 +++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_xive.h | 2 ++ arch/powerpc/kvm/book3s_xive_native.c | 9 +++++++ arch/powerpc/sysdev/xive/native.c | 10 ++++++++ 6 files changed, 57 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index 8b211faa0e42..cf8bb6ac4463 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -80,10 +80,13 @@ #define TM_QW0W2_VU PPC_BIT32(0) #define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? #define TM_QW1W2_VO PPC_BIT32(0) +#define TM_QW1W2_HO PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) #define TM_QW2W2_VP PPC_BIT32(0) +#define TM_QW2W2_HP PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) #define TM_QW3W2_VT PPC_BIT32(0) +#define TM_QW3W2_HT PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW3W2_LP PPC_BIT32(6) #define TM_QW3W2_LE PPC_BIT32(7) #define TM_QW3W2_T PPC_BIT32(31) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 20ae50ab083c..92930b0b5d0e 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -126,6 +126,7 @@ int xive_native_enable_vp(u32 vp_id, bool single_escalation); int xive_native_disable_vp(u32 vp_id); int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); bool xive_native_has_single_escalation(void); +bool xive_native_has_save_restore(void); int xive_native_get_queue_info(u32 vp_id, uint32_t prio, u64 *out_qpage, diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 555cc610e7ab..912c1e9eef6b 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -59,6 +59,25 @@ */ #define XIVE_Q_GAP 2 +static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + /* Check enablement at VP level */ + return xc->vp_cam & TM_QW1W2_HO; +} + +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + + if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE) + return kvmppc_xive_vcpu_has_save_restore(vcpu); + + return true; +} + /* * Push a vcpu's context to the XIVE on guest entry. * This assumes we are in virtual mode (MMU on) @@ -77,7 +96,8 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) return; eieio(); - __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); vcpu->arch.xive_pushed = 1; eieio(); @@ -149,7 +169,8 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) /* First load to pull the context, we ignore the value */ __raw_readl(tima + TM_SPC_PULL_OS_CTX); /* Second load to recover the context state (Words 0 and 1) */ - vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); /* Fixup some of the state for the next load */ vcpu->arch.xive_saved_state.lsmfb = 0; @@ -1319,6 +1340,12 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", cpu); + r = -EIO; + goto bail; + } + /* Configure VCPU fields for use by assembly push/pull */ vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); @@ -2138,6 +2165,9 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (xive_native_has_single_escalation()) xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; + kvm->arch.xive = xive; return 0; } diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 73c3cd25093c..e6a9651c6f1e 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -98,6 +98,7 @@ struct kvmppc_xive_ops { }; #define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1 +#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2 struct kvmppc_xive { struct kvm *kvm; @@ -309,6 +310,7 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu); static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive) { diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 2abb1358a268..af65ea21bde7 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -168,6 +168,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, goto bail; } + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", server_num); + rc = -EIO; + goto bail; + } + /* * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering @@ -1114,6 +1120,9 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) if (xive_native_has_single_escalation()) xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; + xive->ops = &kvmppc_xive_native_ops; kvm->arch.xive = xive; diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 57e3f1540435..1aec282cd650 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -41,6 +41,7 @@ static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; static bool xive_has_single_esc; +static bool xive_has_save_restore; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -588,6 +589,9 @@ bool __init xive_native_init(void) if (of_get_property(np, "single-escalation-support", NULL) != NULL) xive_has_single_esc = true; + if (of_get_property(np, "vp-save-restore", NULL)) + xive_has_save_restore = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -752,6 +756,12 @@ bool xive_native_has_single_escalation(void) } EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); +bool xive_native_has_save_restore(void) +{ + return xive_has_save_restore; +} +EXPORT_SYMBOL_GPL(xive_native_has_save_restore); + int xive_native_get_queue_info(u32 vp_id, u32 prio, u64 *out_qpage, u64 *out_qsize, From 1bce54250045443d55659b0b23be51e87ed2b919 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Aug 2021 17:26:28 +0100 Subject: [PATCH 130/474] powerpc: Bulk conversion to generic_handle_domain_irq() Wherever possible, replace constructs that match either generic_handle_irq(irq_find_mapping()) or generic_handle_irq(irq_linear_revmap()) to a single call to generic_handle_domain_irq(). Signed-off-by: Marc Zyngier Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210802162630.2219813-13-maz@kernel.org --- arch/powerpc/platforms/4xx/uic.c | 4 +--- .../powerpc/platforms/512x/mpc5121_ads_cpld.c | 23 ++++++++----------- arch/powerpc/platforms/52xx/media5200.c | 9 ++++---- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 7 ++---- arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 6 ++--- arch/powerpc/platforms/cell/interrupt.c | 8 ++----- arch/powerpc/platforms/cell/spider-pic.c | 11 +++------ arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 15 ++++++------ arch/powerpc/platforms/powernv/opal-irqchip.c | 11 ++++----- arch/powerpc/sysdev/fsl_mpic_err.c | 11 ++++----- arch/powerpc/sysdev/fsl_msi.c | 12 ++++------ 11 files changed, 43 insertions(+), 74 deletions(-) diff --git a/arch/powerpc/platforms/4xx/uic.c b/arch/powerpc/platforms/4xx/uic.c index 36fb66ce54cf..89e2587b1a59 100644 --- a/arch/powerpc/platforms/4xx/uic.c +++ b/arch/powerpc/platforms/4xx/uic.c @@ -198,7 +198,6 @@ static void uic_irq_cascade(struct irq_desc *desc) struct uic *uic = irq_desc_get_handler_data(desc); u32 msr; int src; - int subvirq; raw_spin_lock(&desc->lock); if (irqd_is_level_type(idata)) @@ -213,8 +212,7 @@ static void uic_irq_cascade(struct irq_desc *desc) src = 32 - ffs(msr); - subvirq = irq_linear_revmap(uic->irqhost, src); - generic_handle_irq(subvirq); + generic_handle_domain_irq(uic->irqhost, src); uic_irq_ret: raw_spin_lock(&desc->lock); diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index b2981634f1f8..ea46870e5d6e 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -81,11 +81,10 @@ static struct irq_chip cpld_pic = { .irq_unmask = cpld_unmask_irq, }; -static int +static unsigned int cpld_pic_get_irq(int offset, u8 ignore, u8 __iomem *statusp, u8 __iomem *maskp) { - int cpld_irq; u8 status = in_8(statusp); u8 mask = in_8(maskp); @@ -93,28 +92,26 @@ cpld_pic_get_irq(int offset, u8 ignore, u8 __iomem *statusp, status |= (ignore | mask); if (status == 0xff) - return 0; + return ~0; - cpld_irq = ffz(status) + offset; - - return irq_linear_revmap(cpld_pic_host, cpld_irq); + return ffz(status) + offset; } static void cpld_pic_cascade(struct irq_desc *desc) { - unsigned int irq; + unsigned int hwirq; - irq = cpld_pic_get_irq(0, PCI_IGNORE, &cpld_regs->pci_status, + hwirq = cpld_pic_get_irq(0, PCI_IGNORE, &cpld_regs->pci_status, &cpld_regs->pci_mask); - if (irq) { - generic_handle_irq(irq); + if (hwirq != ~0) { + generic_handle_domain_irq(cpld_pic_host, hwirq); return; } - irq = cpld_pic_get_irq(8, MISC_IGNORE, &cpld_regs->misc_status, + hwirq = cpld_pic_get_irq(8, MISC_IGNORE, &cpld_regs->misc_status, &cpld_regs->misc_mask); - if (irq) { - generic_handle_irq(irq); + if (hwirq != ~0) { + generic_handle_domain_irq(cpld_pic_host, hwirq); return; } } diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index efb8bdecbcc7..110c444f4bc7 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -78,7 +78,7 @@ static struct irq_chip media5200_irq_chip = { static void media5200_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - int sub_virq, val; + int val; u32 status, enable; /* Mask off the cascaded IRQ */ @@ -92,11 +92,10 @@ static void media5200_irq_cascade(struct irq_desc *desc) enable = in_be32(media5200_irq.regs + MEDIA5200_IRQ_STATUS); val = ffs((status & enable) >> MEDIA5200_IRQ_SHIFT); if (val) { - sub_virq = irq_linear_revmap(media5200_irq.irqhost, val - 1); - /* pr_debug("%s: virq=%i s=%.8x e=%.8x hwirq=%i subvirq=%i\n", - * __func__, virq, status, enable, val - 1, sub_virq); + generic_handle_domain_irq(media5200_irq.irqhost, val - 1); + /* pr_debug("%s: virq=%i s=%.8x e=%.8x hwirq=%i\n", + * __func__, virq, status, enable, val - 1); */ - generic_handle_irq(sub_virq); } /* Processing done; can reenable the cascade now */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 3823df235f25..f862b48b4824 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -190,14 +190,11 @@ static struct irq_chip mpc52xx_gpt_irq_chip = { static void mpc52xx_gpt_irq_cascade(struct irq_desc *desc) { struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc); - int sub_virq; u32 status; status = in_be32(&gpt->regs->status) & MPC52xx_GPT_STATUS_IRQMASK; - if (status) { - sub_virq = irq_linear_revmap(gpt->irqhost, 0); - generic_handle_irq(sub_virq); - } + if (status) + generic_handle_domain_irq(gpt->irqhost, 0); } static int mpc52xx_gpt_irq_map(struct irq_domain *h, unsigned int virq, diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index f82f75a6085c..285bfe19b798 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -91,10 +91,8 @@ static void pq2ads_pci_irq_demux(struct irq_desc *desc) break; for (bit = 0; pend != 0; ++bit, pend <<= 1) { - if (pend & 0x80000000) { - int virq = irq_linear_revmap(priv->host, bit); - generic_handle_irq(virq); - } + if (pend & 0x80000000) + generic_handle_domain_irq(priv->host, bit); } } } diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index c0ab62ba6f16..0873a7a20271 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -106,13 +106,9 @@ static void iic_ioexc_cascade(struct irq_desc *desc) out_be64(&node_iic->iic_is, ack); /* handle them */ for (cascade = 63; cascade >= 0; cascade--) - if (bits & (0x8000000000000000UL >> cascade)) { - unsigned int cirq = - irq_linear_revmap(iic_host, + if (bits & (0x8000000000000000UL >> cascade)) + generic_handle_domain_irq(iic_host, base | cascade); - if (cirq) - generic_handle_irq(cirq); - } /* post-ack level interrupts */ ack = bits & ~IIC_ISR_EDGE_MASK; if (ack) diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 210785f59271..8af75867cb42 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -190,16 +190,11 @@ static void spider_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct spider_pic *pic = irq_desc_get_handler_data(desc); - unsigned int cs, virq; + unsigned int cs; cs = in_be32(pic->regs + TIR_CS) >> 24; - if (cs == SPIDER_IRQ_INVALID) - virq = 0; - else - virq = irq_linear_revmap(pic->host, cs); - - if (virq) - generic_handle_irq(virq); + if (cs != SPIDER_IRQ_INVALID) + generic_handle_domain_irq(pic->host, cs); chip->irq_eoi(&desc->irq_data); } diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index a1b7f79a8a15..15396333a90b 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -108,7 +108,6 @@ static const struct irq_domain_ops hlwd_irq_domain_ops = { static unsigned int __hlwd_pic_get_irq(struct irq_domain *h) { void __iomem *io_base = h->host_data; - int irq; u32 irq_status; irq_status = in_be32(io_base + HW_BROADWAY_ICR) & @@ -116,23 +115,22 @@ static unsigned int __hlwd_pic_get_irq(struct irq_domain *h) if (irq_status == 0) return 0; /* no more IRQs pending */ - irq = __ffs(irq_status); - return irq_linear_revmap(h, irq); + return __ffs(irq_status); } static void hlwd_pic_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_domain *irq_domain = irq_desc_get_handler_data(desc); - unsigned int virq; + unsigned int hwirq; raw_spin_lock(&desc->lock); chip->irq_mask(&desc->irq_data); /* IRQ_LEVEL */ raw_spin_unlock(&desc->lock); - virq = __hlwd_pic_get_irq(irq_domain); - if (virq) - generic_handle_irq(virq); + hwirq = __hlwd_pic_get_irq(irq_domain); + if (hwirq) + generic_handle_domain_irq(irq_domain, hwirq); else pr_err("spurious interrupt!\n"); @@ -190,7 +188,8 @@ static struct irq_domain *hlwd_pic_init(struct device_node *np) unsigned int hlwd_pic_get_irq(void) { - return __hlwd_pic_get_irq(hlwd_irq_host); + unsigned int hwirq = __hlwd_pic_get_irq(hlwd_irq_host); + return hwirq ? irq_linear_revmap(hlwd_irq_host, hwirq) : 0; } /* diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index c164419e254d..d55652b5f6fa 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -46,18 +46,15 @@ void opal_handle_events(void) e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask; again: while (e) { - int virq, hwirq; + int hwirq; hwirq = fls64(e) - 1; e &= ~BIT_ULL(hwirq); local_irq_disable(); - virq = irq_find_mapping(opal_event_irqchip.domain, hwirq); - if (virq) { - irq_enter(); - generic_handle_irq(virq); - irq_exit(); - } + irq_enter(); + generic_handle_domain_irq(opal_event_irqchip.domain, hwirq); + irq_exit(); local_irq_enable(); cond_resched(); diff --git a/arch/powerpc/sysdev/fsl_mpic_err.c b/arch/powerpc/sysdev/fsl_mpic_err.c index 5fa5fa215541..9a98bb212922 100644 --- a/arch/powerpc/sysdev/fsl_mpic_err.c +++ b/arch/powerpc/sysdev/fsl_mpic_err.c @@ -99,7 +99,6 @@ static irqreturn_t fsl_error_int_handler(int irq, void *data) struct mpic *mpic = (struct mpic *) data; u32 eisr, eimr; int errint; - unsigned int cascade_irq; eisr = mpic_fsl_err_read(mpic->err_regs, MPIC_ERR_INT_EISR); eimr = mpic_fsl_err_read(mpic->err_regs, MPIC_ERR_INT_EIMR); @@ -108,13 +107,11 @@ static irqreturn_t fsl_error_int_handler(int irq, void *data) return IRQ_NONE; while (eisr) { + int ret; errint = __builtin_clz(eisr); - cascade_irq = irq_linear_revmap(mpic->irqhost, - mpic->err_int_vecs[errint]); - WARN_ON(!cascade_irq); - if (cascade_irq) { - generic_handle_irq(cascade_irq); - } else { + ret = generic_handle_domain_irq(mpic->irqhost, + mpic->err_int_vecs[errint]); + if (WARN_ON(ret)) { eimr |= 1 << (31 - errint); mpic_fsl_err_write(mpic->err_regs, eimr); } diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 808e7118abfc..e6b06c3f8197 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -266,7 +266,6 @@ out_free: static irqreturn_t fsl_msi_cascade(int irq, void *data) { - unsigned int cascade_irq; struct fsl_msi *msi_data; int msir_index = -1; u32 msir_value = 0; @@ -279,9 +278,6 @@ static irqreturn_t fsl_msi_cascade(int irq, void *data) msir_index = cascade_data->index; - if (msir_index >= NR_MSI_REG_MAX) - cascade_irq = 0; - switch (msi_data->feature & FSL_PIC_IP_MASK) { case FSL_PIC_IP_MPIC: msir_value = fsl_msi_read(msi_data->msi_regs, @@ -305,15 +301,15 @@ static irqreturn_t fsl_msi_cascade(int irq, void *data) } while (msir_value) { + int err; intr_index = ffs(msir_value) - 1; - cascade_irq = irq_linear_revmap(msi_data->irqhost, + err = generic_handle_domain_irq(msi_data->irqhost, msi_hwirq(msi_data, msir_index, intr_index + have_shift)); - if (cascade_irq) { - generic_handle_irq(cascade_irq); + if (!err) ret = IRQ_HANDLED; - } + have_shift += intr_index + 1; msir_value = msir_value >> (intr_index + 1); } From afefe67e0893325d75eb7b816dd394eef2eac628 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Tue, 10 Aug 2021 12:18:08 +0530 Subject: [PATCH 131/474] iommu/arm-smmu: Add clk_bulk_{prepare/unprepare} to system pm callbacks Some clocks for SMMU can have parent as XO such as gpu_cc_hub_cx_int_clk of GPU SMMU in QTI SC7280 SoC and in order to enter deep sleep states in such cases, we would need to drop the XO clock vote in unprepare call and this unprepare callback for XO is in RPMh (Resource Power Manager-Hardened) clock driver which controls RPMh managed clock resources for new QTI SoCs. Given we cannot have a sleeping calls such as clk_bulk_prepare() and clk_bulk_unprepare() in arm-smmu runtime pm callbacks since the iommu operations like map and unmap can be in atomic context and are in fast path, add this prepare and unprepare call to drop the XO vote only for system pm callbacks since it is not a fast path and we expect the system to enter deep sleep states with system pm as opposed to runtime pm. This is a similar sequence of clock requests (prepare,enable and disable,unprepare) in arm-smmu probe and remove. Signed-off-by: Sai Prakash Ranjan Co-developed-by: Rajendra Nayak Signed-off-by: Rajendra Nayak Link: https://lore.kernel.org/r/20210810064808.32486-1-saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 28 +++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f22dbeb1e510..fc8b932b47d4 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -2281,18 +2281,38 @@ static int __maybe_unused arm_smmu_runtime_suspend(struct device *dev) static int __maybe_unused arm_smmu_pm_resume(struct device *dev) { + int ret; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); + + ret = clk_bulk_prepare(smmu->num_clks, smmu->clks); + if (ret) + return ret; + if (pm_runtime_suspended(dev)) return 0; - return arm_smmu_runtime_resume(dev); + ret = arm_smmu_runtime_resume(dev); + if (ret) + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + + return ret; } static int __maybe_unused arm_smmu_pm_suspend(struct device *dev) { - if (pm_runtime_suspended(dev)) - return 0; + int ret = 0; + struct arm_smmu_device *smmu = dev_get_drvdata(dev); - return arm_smmu_runtime_suspend(dev); + if (pm_runtime_suspended(dev)) + goto clk_unprepare; + + ret = arm_smmu_runtime_suspend(dev); + if (ret) + return ret; + +clk_unprepare: + clk_bulk_unprepare(smmu->num_clks, smmu->clks); + return ret; } static const struct dev_pm_ops arm_smmu_pm_ops = { From 211ff31b3d33b56aa12937e898c9280d07daf0d9 Mon Sep 17 00:00:00 2001 From: Ashish Mhetre Date: Tue, 10 Aug 2021 10:14:00 +0530 Subject: [PATCH 132/474] iommu: Fix race condition during default domain allocation When two devices with same SID are getting probed concurrently through iommu_probe_device(), the iommu_domain sometimes is getting allocated more than once as call to iommu_alloc_default_domain() is not protected for concurrency. Furthermore, it leads to each device holding a different iommu_domain pointer, separate IOVA space and only one of the devices' domain is used for translations from IOMMU. This causes accesses from other device to fault or see incorrect translations. Fix this by protecting iommu_alloc_default_domain() call with group->mutex and let all devices with same SID share same iommu_domain. Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/1628570641-9127-2-git-send-email-amhetre@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/iommu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5419c4b9f27a..80c5a1c57216 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -273,7 +273,9 @@ int iommu_probe_device(struct device *dev) * support default domains, so the return value is not yet * checked. */ + mutex_lock(&group->mutex); iommu_alloc_default_domain(group, dev); + mutex_unlock(&group->mutex); if (group->default_domain) { ret = __iommu_attach_device(group->default_domain, dev); From b1a1347912a742a4e1fcdc9df6302dd9dd2c3405 Mon Sep 17 00:00:00 2001 From: Krishna Reddy Date: Tue, 10 Aug 2021 10:14:01 +0530 Subject: [PATCH 133/474] iommu/arm-smmu: Fix race condition during iommu_group creation When two devices with same SID are getting probed concurrently through iommu_probe_device(), the iommu_group sometimes is getting allocated more than once as call to arm_smmu_device_group() is not protected for concurrency. Furthermore, it leads to each device holding a different iommu_group and domain pointer, separate IOVA space and only one of the devices' domain is used for translations from IOMMU. This causes accesses from other device to fault or see incorrect translations. Fix this by protecting iommu_group allocation from concurrency in arm_smmu_device_group(). Signed-off-by: Krishna Reddy Signed-off-by: Ashish Mhetre Link: https://lore.kernel.org/r/1628570641-9127-3-git-send-email-amhetre@nvidia.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index fc8b932b47d4..f7da8953afbe 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1478,6 +1478,7 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) struct iommu_group *group = NULL; int i, idx; + mutex_lock(&smmu->stream_map_mutex); for_each_cfg_sme(cfg, fwspec, i, idx) { if (group && smmu->s2crs[idx].group && group != smmu->s2crs[idx].group) @@ -1486,8 +1487,10 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) group = smmu->s2crs[idx].group; } - if (group) + if (group) { + mutex_unlock(&smmu->stream_map_mutex); return iommu_group_ref_get(group); + } if (dev_is_pci(dev)) group = pci_device_group(dev); @@ -1501,6 +1504,7 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) for_each_cfg_sme(cfg, fwspec, i, idx) smmu->s2crs[idx].group = group; + mutex_unlock(&smmu->stream_map_mutex); return group; } From 94effcedaa543825ad9c80831450d4fbfa284880 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 5 Aug 2021 11:49:36 +0900 Subject: [PATCH 134/474] openrisc: Fix compiler warnings in setup This was pointed out with the recent name change of or32_early_setup to or1k_early_setup. Investigating the file I found a few other warnings so cleaning them up here. arch/openrisc/kernel/setup.c:220:13: warning: no previous prototype for 'or1k_early_setup' [-Wmissing-prototypes] 220 | void __init or1k_early_setup(void *fdt) | ^~~~~~~~~~~~~~~~ Fix this the missing or1k_early_setup prototype warning by adding an asm/setup.h file to define the prototype. arch/openrisc/kernel/setup.c:246:13: warning: no previous prototype for 'detect_unit_config' [-Wmissing-prototypes] 246 | void __init detect_unit_config(unsigned long upr, unsigned long mask, | ^~~~~~~~~~~~~~~~~~ The function detect_unit_config is not used, just remove it. arch/openrisc/kernel/setup.c:221: warning: Function parameter or member 'fdt' not described in 'or1k_early_setup' Add @fdt docs to the function comment to suppress this warning. Reported-by: kernel test robot Signed-off-by: Stafford Horne Reviewed-by: Randy Dunlap --- arch/openrisc/include/asm/setup.h | 15 +++++++++++++++ arch/openrisc/kernel/setup.c | 16 +--------------- 2 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 arch/openrisc/include/asm/setup.h diff --git a/arch/openrisc/include/asm/setup.h b/arch/openrisc/include/asm/setup.h new file mode 100644 index 000000000000..9acbc5deda69 --- /dev/null +++ b/arch/openrisc/include/asm/setup.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Stafford Horne + */ +#ifndef _ASM_OR1K_SETUP_H +#define _ASM_OR1K_SETUP_H + +#include +#include + +#ifndef __ASSEMBLY__ +void __init or1k_early_setup(void *fdt); +#endif + +#endif /* _ASM_OR1K_SETUP_H */ diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 7eddcac0ef2f..0cd04d936a7a 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -210,6 +210,7 @@ void __init setup_cpuinfo(void) /** * or1k_early_setup + * @fdt: pointer to the start of the device tree in memory or NULL * * Handles the pointer to the device tree that this kernel is to use * for establishing the available platform devices. @@ -243,21 +244,6 @@ static inline unsigned long extract_value(unsigned long reg, unsigned long mask) return mask & reg; } -void __init detect_unit_config(unsigned long upr, unsigned long mask, - char *text, void (*func) (void)) -{ - if (text != NULL) - printk("%s", text); - - if (upr & mask) { - if (func != NULL) - func(); - else - printk("present\n"); - } else - printk("not present\n"); -} - /* * calibrate_delay * From 892384cd998a17960dff6ebefc27375f63364111 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 3 Aug 2021 14:16:49 +0200 Subject: [PATCH 135/474] iommu/io-pgtable: Add DART pagetable format Apple's DART iommu uses a pagetable format that shares some similarities with the ones already implemented by io-pgtable.c. Add a new format variant to support the required differences so that we don't have to duplicate the pagetable handling code. Reviewed-by: Alexander Graf Reviewed-by: Alyssa Rosenzweig Reviewed-by: Robin Murphy Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210803121651.61594-2-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm.c | 63 ++++++++++++++++++++++++++++++++++ drivers/iommu/io-pgtable.c | 1 + include/linux/io-pgtable.h | 7 ++++ 3 files changed, 71 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 053df4048a29..0779eb96bd29 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -130,6 +130,9 @@ #define ARM_MALI_LPAE_MEMATTR_IMP_DEF 0x88ULL #define ARM_MALI_LPAE_MEMATTR_WRITE_ALLOC 0x8DULL +#define APPLE_DART_PTE_PROT_NO_WRITE (1<<7) +#define APPLE_DART_PTE_PROT_NO_READ (1<<8) + /* IOPTE accessors */ #define iopte_deref(pte,d) __va(iopte_to_paddr(pte, d)) @@ -402,6 +405,15 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, { arm_lpae_iopte pte; + if (data->iop.fmt == APPLE_DART) { + pte = 0; + if (!(prot & IOMMU_WRITE)) + pte |= APPLE_DART_PTE_PROT_NO_WRITE; + if (!(prot & IOMMU_READ)) + pte |= APPLE_DART_PTE_PROT_NO_READ; + return pte; + } + if (data->iop.fmt == ARM_64_LPAE_S1 || data->iop.fmt == ARM_32_LPAE_S1) { pte = ARM_LPAE_PTE_nG; @@ -1102,6 +1114,52 @@ out_free_data: return NULL; } +static struct io_pgtable * +apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) +{ + struct arm_lpae_io_pgtable *data; + int i; + + if (cfg->oas > 36) + return NULL; + + data = arm_lpae_alloc_pgtable(cfg); + if (!data) + return NULL; + + /* + * The table format itself always uses two levels, but the total VA + * space is mapped by four separate tables, making the MMIO registers + * an effective "level 1". For simplicity, though, we treat this + * equivalently to LPAE stage 2 concatenation at level 2, with the + * additional TTBRs each just pointing at consecutive pages. + */ + if (data->start_level < 1) + goto out_free_data; + if (data->start_level == 1 && data->pgd_bits > 2) + goto out_free_data; + if (data->start_level > 1) + data->pgd_bits = 0; + data->start_level = 2; + cfg->apple_dart_cfg.n_ttbrs = 1 << data->pgd_bits; + data->pgd_bits += data->bits_per_level; + + data->pgd = __arm_lpae_alloc_pages(ARM_LPAE_PGD_SIZE(data), GFP_KERNEL, + cfg); + if (!data->pgd) + goto out_free_data; + + for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) + cfg->apple_dart_cfg.ttbr[i] = + virt_to_phys(data->pgd + i * ARM_LPAE_GRANULE(data)); + + return &data->iop; + +out_free_data: + kfree(data); + return NULL; +} + struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = { .alloc = arm_64_lpae_alloc_pgtable_s1, .free = arm_lpae_free_pgtable, @@ -1127,6 +1185,11 @@ struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns = { .free = arm_lpae_free_pgtable, }; +struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns = { + .alloc = apple_dart_alloc_pgtable, + .free = arm_lpae_free_pgtable, +}; + #ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST static struct io_pgtable_cfg *cfg_cookie __initdata; diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c index 6e9917ce980f..f4bfcef98297 100644 --- a/drivers/iommu/io-pgtable.c +++ b/drivers/iommu/io-pgtable.c @@ -20,6 +20,7 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = { [ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns, [ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns, [ARM_MALI_LPAE] = &io_pgtable_arm_mali_lpae_init_fns, + [APPLE_DART] = &io_pgtable_apple_dart_init_fns, #endif #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..a738483fb4da 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -16,6 +16,7 @@ enum io_pgtable_fmt { ARM_V7S, ARM_MALI_LPAE, AMD_IOMMU_V1, + APPLE_DART, IO_PGTABLE_NUM_FMTS, }; @@ -136,6 +137,11 @@ struct io_pgtable_cfg { u64 transtab; u64 memattr; } arm_mali_lpae_cfg; + + struct { + u64 ttbr[4]; + u32 n_ttbrs; + } apple_dart_cfg; }; }; @@ -254,5 +260,6 @@ extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns; extern struct io_pgtable_init_fns io_pgtable_arm_mali_lpae_init_fns; extern struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns; +extern struct io_pgtable_init_fns io_pgtable_apple_dart_init_fns; #endif /* __IO_PGTABLE_H */ From 9d9cafb45c71c9fe302234807fae8f743056f88a Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 3 Aug 2021 14:16:50 +0200 Subject: [PATCH 136/474] dt-bindings: iommu: add DART iommu bindings DART (Device Address Resolution Table) is the iommu found on Apple ARM SoCs such as the M1. Reviewed-by: Rob Herring Reviewed-by: Alyssa Rosenzweig Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210803121651.61594-3-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- .../devicetree/bindings/iommu/apple,dart.yaml | 81 +++++++++++++++++++ MAINTAINERS | 6 ++ 2 files changed, 87 insertions(+) create mode 100644 Documentation/devicetree/bindings/iommu/apple,dart.yaml diff --git a/Documentation/devicetree/bindings/iommu/apple,dart.yaml b/Documentation/devicetree/bindings/iommu/apple,dart.yaml new file mode 100644 index 000000000000..94aa9e9afa59 --- /dev/null +++ b/Documentation/devicetree/bindings/iommu/apple,dart.yaml @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iommu/apple,dart.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Apple DART IOMMU + +maintainers: + - Sven Peter + +description: |+ + Apple SoCs may contain an implementation of their Device Address + Resolution Table which provides a mandatory layer of address + translations for various masters. + + Each DART instance is capable of handling up to 16 different streams + with individual pagetables and page-level read/write protection flags. + + This DART IOMMU also raises interrupts in response to various + fault conditions. + +properties: + compatible: + const: apple,t8103-dart + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + description: + Reference to the gate clock phandle if required for this IOMMU. + Optional since not all IOMMUs are attached to a clock gate. + + '#iommu-cells': + const: 1 + description: + Has to be one. The single cell describes the stream id emitted by + a master to the IOMMU. + +required: + - compatible + - reg + - '#iommu-cells' + - interrupts + +additionalProperties: false + +examples: + - |+ + dart1: iommu@82f80000 { + compatible = "apple,t8103-dart"; + reg = <0x82f80000 0x4000>; + interrupts = <1 781 4>; + #iommu-cells = <1>; + }; + + master1 { + iommus = <&dart1 0>; + }; + + - |+ + dart2a: iommu@82f00000 { + compatible = "apple,t8103-dart"; + reg = <0x82f00000 0x4000>; + interrupts = <1 781 4>; + #iommu-cells = <1>; + }; + dart2b: iommu@82f80000 { + compatible = "apple,t8103-dart"; + reg = <0x82f80000 0x4000>; + interrupts = <1 781 4>; + #iommu-cells = <1>; + }; + + master2 { + iommus = <&dart2a 0>, <&dart2b 1>; + }; diff --git a/MAINTAINERS b/MAINTAINERS index 19135a9d778e..d5a8f7087a74 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1262,6 +1262,12 @@ L: linux-input@vger.kernel.org S: Odd fixes F: drivers/input/mouse/bcm5974.c +APPLE DART IOMMU DRIVER +M: Sven Peter +L: iommu@lists.linux-foundation.org +S: Maintained +F: Documentation/devicetree/bindings/iommu/apple,dart.yaml + APPLE SMC DRIVER M: Henrik Rydberg L: linux-hwmon@vger.kernel.org From 46d1fb072e76b161b0fb1ada9e37bf7e4d1f123f Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Tue, 3 Aug 2021 14:16:51 +0200 Subject: [PATCH 137/474] iommu/dart: Add DART iommu driver Apple's new SoCs use iommus for almost all peripherals. These Device Address Resolution Tables must be setup before these peripherals can act as DMA masters. Tested-by: Alyssa Rosenzweig Signed-off-by: Sven Peter Link: https://lore.kernel.org/r/20210803121651.61594-4-sven@svenpeter.dev Signed-off-by: Joerg Roedel --- MAINTAINERS | 1 + drivers/iommu/Kconfig | 14 + drivers/iommu/Makefile | 1 + drivers/iommu/apple-dart.c | 923 +++++++++++++++++++++++++++++++++++++ 4 files changed, 939 insertions(+) create mode 100644 drivers/iommu/apple-dart.c diff --git a/MAINTAINERS b/MAINTAINERS index d5a8f7087a74..5e03acfb00f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1267,6 +1267,7 @@ M: Sven Peter L: iommu@lists.linux-foundation.org S: Maintained F: Documentation/devicetree/bindings/iommu/apple,dart.yaml +F: drivers/iommu/apple-dart.c APPLE SMC DRIVER M: Henrik Rydberg diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c84da8205be7..dfe81da483e9 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -290,6 +290,20 @@ config SPAPR_TCE_IOMMU Enables bits of IOMMU API required by VFIO. The iommu_ops is not implemented as it is not necessary for VFIO. +config APPLE_DART + tristate "Apple DART IOMMU Support" + depends on ARM64 || (COMPILE_TEST && !GENERIC_ATOMIC64) + select IOMMU_API + select IOMMU_IO_PGTABLE_LPAE + default ARCH_APPLE + help + Support for Apple DART (Device Address Resolution Table) IOMMUs + found in Apple ARM SoCs like the M1. + This IOMMU is required for most peripherals using DMA to access + the main memory. + + Say Y here if you are using an Apple SoC. + # ARM IOMMU support config ARM_SMMU tristate "ARM Ltd. System MMU (SMMU) Support" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index c0fb0ba88143..bc7f730edbb0 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o obj-$(CONFIG_IOMMU_SVA_LIB) += iommu-sva-lib.o io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o +obj-$(CONFIG_APPLE_DART) += apple-dart.o diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c new file mode 100644 index 000000000000..559db9259e65 --- /dev/null +++ b/drivers/iommu/apple-dart.c @@ -0,0 +1,923 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Apple DART (Device Address Resolution Table) IOMMU driver + * + * Copyright (C) 2021 The Asahi Linux Contributors + * + * Based on arm/arm-smmu/arm-ssmu.c and arm/arm-smmu-v3/arm-smmu-v3.c + * Copyright (C) 2013 ARM Limited + * Copyright (C) 2015 ARM Limited + * and on exynos-iommu.c + * Copyright (c) 2011,2016 Samsung Electronics Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DART_MAX_STREAMS 16 +#define DART_MAX_TTBR 4 +#define MAX_DARTS_PER_DEVICE 2 + +#define DART_STREAM_ALL 0xffff + +#define DART_PARAMS1 0x00 +#define DART_PARAMS_PAGE_SHIFT GENMASK(27, 24) + +#define DART_PARAMS2 0x04 +#define DART_PARAMS_BYPASS_SUPPORT BIT(0) + +#define DART_STREAM_COMMAND 0x20 +#define DART_STREAM_COMMAND_BUSY BIT(2) +#define DART_STREAM_COMMAND_INVALIDATE BIT(20) + +#define DART_STREAM_SELECT 0x34 + +#define DART_ERROR 0x40 +#define DART_ERROR_STREAM GENMASK(27, 24) +#define DART_ERROR_CODE GENMASK(11, 0) +#define DART_ERROR_FLAG BIT(31) + +#define DART_ERROR_READ_FAULT BIT(4) +#define DART_ERROR_WRITE_FAULT BIT(3) +#define DART_ERROR_NO_PTE BIT(2) +#define DART_ERROR_NO_PMD BIT(1) +#define DART_ERROR_NO_TTBR BIT(0) + +#define DART_CONFIG 0x60 +#define DART_CONFIG_LOCK BIT(15) + +#define DART_STREAM_COMMAND_BUSY_TIMEOUT 100 + +#define DART_ERROR_ADDR_HI 0x54 +#define DART_ERROR_ADDR_LO 0x50 + +#define DART_TCR(sid) (0x100 + 4 * (sid)) +#define DART_TCR_TRANSLATE_ENABLE BIT(7) +#define DART_TCR_BYPASS0_ENABLE BIT(8) +#define DART_TCR_BYPASS1_ENABLE BIT(12) + +#define DART_TTBR(sid, idx) (0x200 + 16 * (sid) + 4 * (idx)) +#define DART_TTBR_VALID BIT(31) +#define DART_TTBR_SHIFT 12 + +/* + * Private structure associated with each DART device. + * + * @dev: device struct + * @regs: mapped MMIO region + * @irq: interrupt number, can be shared with other DARTs + * @clks: clocks associated with this DART + * @num_clks: number of @clks + * @lock: lock for hardware operations involving this dart + * @pgsize: pagesize supported by this DART + * @supports_bypass: indicates if this DART supports bypass mode + * @force_bypass: force bypass mode due to pagesize mismatch? + * @sid2group: maps stream ids to iommu_groups + * @iommu: iommu core device + */ +struct apple_dart { + struct device *dev; + + void __iomem *regs; + + int irq; + struct clk_bulk_data *clks; + int num_clks; + + spinlock_t lock; + + u32 pgsize; + u32 supports_bypass : 1; + u32 force_bypass : 1; + + struct iommu_group *sid2group[DART_MAX_STREAMS]; + struct iommu_device iommu; +}; + +/* + * Convenience struct to identify streams. + * + * The normal variant is used inside apple_dart_master_cfg which isn't written + * to concurrently. + * The atomic variant is used inside apple_dart_domain where we have to guard + * against races from potential parallel calls to attach/detach_device. + * Note that even inside the atomic variant the apple_dart pointer is not + * protected: This pointer is initialized once under the domain init mutex + * and never changed again afterwards. Devices with different dart pointers + * cannot be attached to the same domain. + * + * @dart dart pointer + * @sid stream id bitmap + */ +struct apple_dart_stream_map { + struct apple_dart *dart; + unsigned long sidmap; +}; +struct apple_dart_atomic_stream_map { + struct apple_dart *dart; + atomic64_t sidmap; +}; + +/* + * This structure is attached to each iommu domain handled by a DART. + * + * @pgtbl_ops: pagetable ops allocated by io-pgtable + * @finalized: true if the domain has been completely initialized + * @init_lock: protects domain initialization + * @stream_maps: streams attached to this domain (valid for DMA/UNMANAGED only) + * @domain: core iommu domain pointer + */ +struct apple_dart_domain { + struct io_pgtable_ops *pgtbl_ops; + + bool finalized; + struct mutex init_lock; + struct apple_dart_atomic_stream_map stream_maps[MAX_DARTS_PER_DEVICE]; + + struct iommu_domain domain; +}; + +/* + * This structure is attached to devices with dev_iommu_priv_set() on of_xlate + * and contains a list of streams bound to this device. + * So far the worst case seen is a single device with two streams + * from different darts, such that this simple static array is enough. + * + * @streams: streams for this device + */ +struct apple_dart_master_cfg { + struct apple_dart_stream_map stream_maps[MAX_DARTS_PER_DEVICE]; +}; + +/* + * Helper macro to iterate over apple_dart_master_cfg.stream_maps and + * apple_dart_domain.stream_maps + * + * @i int used as loop variable + * @base pointer to base struct (apple_dart_master_cfg or apple_dart_domain) + * @stream pointer to the apple_dart_streams struct for each loop iteration + */ +#define for_each_stream_map(i, base, stream_map) \ + for (i = 0, stream_map = &(base)->stream_maps[0]; \ + i < MAX_DARTS_PER_DEVICE && stream_map->dart; \ + stream_map = &(base)->stream_maps[++i]) + +static struct platform_driver apple_dart_driver; +static const struct iommu_ops apple_dart_iommu_ops; +static const struct iommu_flush_ops apple_dart_tlb_ops; + +static struct apple_dart_domain *to_dart_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct apple_dart_domain, domain); +} + +static void +apple_dart_hw_enable_translation(struct apple_dart_stream_map *stream_map) +{ + int sid; + + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + writel(DART_TCR_TRANSLATE_ENABLE, + stream_map->dart->regs + DART_TCR(sid)); +} + +static void apple_dart_hw_disable_dma(struct apple_dart_stream_map *stream_map) +{ + int sid; + + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + writel(0, stream_map->dart->regs + DART_TCR(sid)); +} + +static void +apple_dart_hw_enable_bypass(struct apple_dart_stream_map *stream_map) +{ + int sid; + + WARN_ON(!stream_map->dart->supports_bypass); + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + writel(DART_TCR_BYPASS0_ENABLE | DART_TCR_BYPASS1_ENABLE, + stream_map->dart->regs + DART_TCR(sid)); +} + +static void apple_dart_hw_set_ttbr(struct apple_dart_stream_map *stream_map, + u8 idx, phys_addr_t paddr) +{ + int sid; + + WARN_ON(paddr & ((1 << DART_TTBR_SHIFT) - 1)); + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + writel(DART_TTBR_VALID | (paddr >> DART_TTBR_SHIFT), + stream_map->dart->regs + DART_TTBR(sid, idx)); +} + +static void apple_dart_hw_clear_ttbr(struct apple_dart_stream_map *stream_map, + u8 idx) +{ + int sid; + + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + writel(0, stream_map->dart->regs + DART_TTBR(sid, idx)); +} + +static void +apple_dart_hw_clear_all_ttbrs(struct apple_dart_stream_map *stream_map) +{ + int i; + + for (i = 0; i < DART_MAX_TTBR; ++i) + apple_dart_hw_clear_ttbr(stream_map, i); +} + +static int +apple_dart_hw_stream_command(struct apple_dart_stream_map *stream_map, + u32 command) +{ + unsigned long flags; + int ret; + u32 command_reg; + + spin_lock_irqsave(&stream_map->dart->lock, flags); + + writel(stream_map->sidmap, stream_map->dart->regs + DART_STREAM_SELECT); + writel(command, stream_map->dart->regs + DART_STREAM_COMMAND); + + ret = readl_poll_timeout_atomic( + stream_map->dart->regs + DART_STREAM_COMMAND, command_reg, + !(command_reg & DART_STREAM_COMMAND_BUSY), 1, + DART_STREAM_COMMAND_BUSY_TIMEOUT); + + spin_unlock_irqrestore(&stream_map->dart->lock, flags); + + if (ret) { + dev_err(stream_map->dart->dev, + "busy bit did not clear after command %x for streams %lx\n", + command, stream_map->sidmap); + return ret; + } + + return 0; +} + +static int +apple_dart_hw_invalidate_tlb(struct apple_dart_stream_map *stream_map) +{ + return apple_dart_hw_stream_command(stream_map, + DART_STREAM_COMMAND_INVALIDATE); +} + +static int apple_dart_hw_reset(struct apple_dart *dart) +{ + u32 config; + struct apple_dart_stream_map stream_map; + + config = readl(dart->regs + DART_CONFIG); + if (config & DART_CONFIG_LOCK) { + dev_err(dart->dev, "DART is locked down until reboot: %08x\n", + config); + return -EINVAL; + } + + stream_map.dart = dart; + stream_map.sidmap = DART_STREAM_ALL; + apple_dart_hw_disable_dma(&stream_map); + apple_dart_hw_clear_all_ttbrs(&stream_map); + + /* clear any pending errors before the interrupt is unmasked */ + writel(readl(dart->regs + DART_ERROR), dart->regs + DART_ERROR); + + return apple_dart_hw_invalidate_tlb(&stream_map); +} + +static void apple_dart_domain_flush_tlb(struct apple_dart_domain *domain) +{ + int i; + struct apple_dart_atomic_stream_map *domain_stream_map; + struct apple_dart_stream_map stream_map; + + for_each_stream_map(i, domain, domain_stream_map) { + stream_map.dart = domain_stream_map->dart; + stream_map.sidmap = atomic64_read(&domain_stream_map->sidmap); + apple_dart_hw_invalidate_tlb(&stream_map); + } +} + +static void apple_dart_flush_iotlb_all(struct iommu_domain *domain) +{ + apple_dart_domain_flush_tlb(to_dart_domain(domain)); +} + +static void apple_dart_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + apple_dart_domain_flush_tlb(to_dart_domain(domain)); +} + +static void apple_dart_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + apple_dart_domain_flush_tlb(to_dart_domain(domain)); +} + +static void apple_dart_tlb_flush_all(void *cookie) +{ + apple_dart_domain_flush_tlb(cookie); +} + +static void apple_dart_tlb_flush_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ + apple_dart_domain_flush_tlb(cookie); +} + +static const struct iommu_flush_ops apple_dart_tlb_ops = { + .tlb_flush_all = apple_dart_tlb_flush_all, + .tlb_flush_walk = apple_dart_tlb_flush_walk, +}; + +static phys_addr_t apple_dart_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + struct io_pgtable_ops *ops = dart_domain->pgtbl_ops; + + if (!ops) + return 0; + + return ops->iova_to_phys(ops, iova); +} + +static int apple_dart_map_pages(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t pgsize, + size_t pgcount, int prot, gfp_t gfp, + size_t *mapped) +{ + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + struct io_pgtable_ops *ops = dart_domain->pgtbl_ops; + + if (!ops) + return -ENODEV; + + return ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, gfp, + mapped); +} + +static size_t apple_dart_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + struct io_pgtable_ops *ops = dart_domain->pgtbl_ops; + + return ops->unmap_pages(ops, iova, pgsize, pgcount, gather); +} + +static void +apple_dart_setup_translation(struct apple_dart_domain *domain, + struct apple_dart_stream_map *stream_map) +{ + int i; + struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(domain->pgtbl_ops)->cfg; + + for (i = 0; i < pgtbl_cfg->apple_dart_cfg.n_ttbrs; ++i) + apple_dart_hw_set_ttbr(stream_map, i, + pgtbl_cfg->apple_dart_cfg.ttbr[i]); + for (; i < DART_MAX_TTBR; ++i) + apple_dart_hw_clear_ttbr(stream_map, i); + + apple_dart_hw_enable_translation(stream_map); + apple_dart_hw_invalidate_tlb(stream_map); +} + +static int apple_dart_finalize_domain(struct iommu_domain *domain, + struct apple_dart_master_cfg *cfg) +{ + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + struct apple_dart *dart = cfg->stream_maps[0].dart; + struct io_pgtable_cfg pgtbl_cfg; + int ret = 0; + int i; + + mutex_lock(&dart_domain->init_lock); + + if (dart_domain->finalized) + goto done; + + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { + dart_domain->stream_maps[i].dart = cfg->stream_maps[i].dart; + atomic64_set(&dart_domain->stream_maps[i].sidmap, + cfg->stream_maps[i].sidmap); + } + + pgtbl_cfg = (struct io_pgtable_cfg){ + .pgsize_bitmap = dart->pgsize, + .ias = 32, + .oas = 36, + .coherent_walk = 1, + .tlb = &apple_dart_tlb_ops, + .iommu_dev = dart->dev, + }; + + dart_domain->pgtbl_ops = + alloc_io_pgtable_ops(APPLE_DART, &pgtbl_cfg, domain); + if (!dart_domain->pgtbl_ops) { + ret = -ENOMEM; + goto done; + } + + domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; + domain->geometry.aperture_start = 0; + domain->geometry.aperture_end = DMA_BIT_MASK(32); + domain->geometry.force_aperture = true; + + dart_domain->finalized = true; + +done: + mutex_unlock(&dart_domain->init_lock); + return ret; +} + +static int +apple_dart_mod_streams(struct apple_dart_atomic_stream_map *domain_maps, + struct apple_dart_stream_map *master_maps, + bool add_streams) +{ + int i; + + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { + if (domain_maps[i].dart != master_maps[i].dart) + return -EINVAL; + } + + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { + if (!domain_maps[i].dart) + break; + if (add_streams) + atomic64_or(master_maps[i].sidmap, + &domain_maps[i].sidmap); + else + atomic64_and(~master_maps[i].sidmap, + &domain_maps[i].sidmap); + } + + return 0; +} + +static int apple_dart_domain_add_streams(struct apple_dart_domain *domain, + struct apple_dart_master_cfg *cfg) +{ + return apple_dart_mod_streams(domain->stream_maps, cfg->stream_maps, + true); +} + +static int apple_dart_domain_remove_streams(struct apple_dart_domain *domain, + struct apple_dart_master_cfg *cfg) +{ + return apple_dart_mod_streams(domain->stream_maps, cfg->stream_maps, + false); +} + +static int apple_dart_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + int ret, i; + struct apple_dart_stream_map *stream_map; + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + + if (cfg->stream_maps[0].dart->force_bypass && + domain->type != IOMMU_DOMAIN_IDENTITY) + return -EINVAL; + if (!cfg->stream_maps[0].dart->supports_bypass && + domain->type == IOMMU_DOMAIN_IDENTITY) + return -EINVAL; + + ret = apple_dart_finalize_domain(domain, cfg); + if (ret) + return ret; + + switch (domain->type) { + case IOMMU_DOMAIN_DMA: + case IOMMU_DOMAIN_UNMANAGED: + ret = apple_dart_domain_add_streams(dart_domain, cfg); + if (ret) + return ret; + + for_each_stream_map(i, cfg, stream_map) + apple_dart_setup_translation(dart_domain, stream_map); + break; + case IOMMU_DOMAIN_BLOCKED: + for_each_stream_map(i, cfg, stream_map) + apple_dart_hw_disable_dma(stream_map); + break; + case IOMMU_DOMAIN_IDENTITY: + for_each_stream_map(i, cfg, stream_map) + apple_dart_hw_enable_bypass(stream_map); + break; + } + + return ret; +} + +static void apple_dart_detach_dev(struct iommu_domain *domain, + struct device *dev) +{ + int i; + struct apple_dart_stream_map *stream_map; + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + + for_each_stream_map(i, cfg, stream_map) + apple_dart_hw_disable_dma(stream_map); + + if (domain->type == IOMMU_DOMAIN_DMA || + domain->type == IOMMU_DOMAIN_UNMANAGED) + apple_dart_domain_remove_streams(dart_domain, cfg); +} + +static struct iommu_device *apple_dart_probe_device(struct device *dev) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_stream_map *stream_map; + int i; + + if (!cfg) + return ERR_PTR(-ENODEV); + + for_each_stream_map(i, cfg, stream_map) + device_link_add( + dev, stream_map->dart->dev, + DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_SUPPLIER); + + return &cfg->stream_maps[0].dart->iommu; +} + +static void apple_dart_release_device(struct device *dev) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + + if (!cfg) + return; + + dev_iommu_priv_set(dev, NULL); + kfree(cfg); +} + +static struct iommu_domain *apple_dart_domain_alloc(unsigned int type) +{ + struct apple_dart_domain *dart_domain; + + if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED && + type != IOMMU_DOMAIN_IDENTITY && type != IOMMU_DOMAIN_BLOCKED) + return NULL; + + dart_domain = kzalloc(sizeof(*dart_domain), GFP_KERNEL); + if (!dart_domain) + return NULL; + + iommu_get_dma_cookie(&dart_domain->domain); + mutex_init(&dart_domain->init_lock); + + /* no need to allocate pgtbl_ops or do any other finalization steps */ + if (type == IOMMU_DOMAIN_IDENTITY || type == IOMMU_DOMAIN_BLOCKED) + dart_domain->finalized = true; + + return &dart_domain->domain; +} + +static void apple_dart_domain_free(struct iommu_domain *domain) +{ + struct apple_dart_domain *dart_domain = to_dart_domain(domain); + + if (dart_domain->pgtbl_ops) + free_io_pgtable_ops(dart_domain->pgtbl_ops); + + kfree(dart_domain); +} + +static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct platform_device *iommu_pdev = of_find_device_by_node(args->np); + struct apple_dart *dart = platform_get_drvdata(iommu_pdev); + struct apple_dart *cfg_dart; + int i, sid; + + if (args->args_count != 1) + return -EINVAL; + sid = args->args[0]; + + if (!cfg) + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + dev_iommu_priv_set(dev, cfg); + + cfg_dart = cfg->stream_maps[0].dart; + if (cfg_dart) { + if (cfg_dart->supports_bypass != dart->supports_bypass) + return -EINVAL; + if (cfg_dart->force_bypass != dart->force_bypass) + return -EINVAL; + if (cfg_dart->pgsize != dart->pgsize) + return -EINVAL; + } + + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { + if (cfg->stream_maps[i].dart == dart) { + cfg->stream_maps[i].sidmap |= 1 << sid; + return 0; + } + } + for (i = 0; i < MAX_DARTS_PER_DEVICE; ++i) { + if (!cfg->stream_maps[i].dart) { + cfg->stream_maps[i].dart = dart; + cfg->stream_maps[i].sidmap = 1 << sid; + return 0; + } + } + + return -EINVAL; +} + +static struct iommu_group *apple_dart_device_group(struct device *dev) +{ + static DEFINE_MUTEX(lock); + int i, sid; + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + struct apple_dart_stream_map *stream_map; + struct iommu_group *group = NULL; + struct iommu_group *res = ERR_PTR(-EINVAL); + + mutex_lock(&lock); + + for_each_stream_map(i, cfg, stream_map) { + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) { + struct iommu_group *stream_group = + stream_map->dart->sid2group[sid]; + + if (group && group != stream_group) { + res = ERR_PTR(-EINVAL); + goto out; + } + + group = stream_group; + } + } + + if (group) { + res = iommu_group_ref_get(group); + goto out; + } + +#ifdef CONFIG_PCI + if (dev_is_pci(dev)) + group = pci_device_group(dev); + else +#endif + group = generic_device_group(dev); + + for_each_stream_map(i, cfg, stream_map) + for_each_set_bit(sid, &stream_map->sidmap, DART_MAX_STREAMS) + stream_map->dart->sid2group[sid] = group; + + res = group; + +out: + mutex_unlock(&lock); + return res; +} + +static int apple_dart_def_domain_type(struct device *dev) +{ + struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); + + if (cfg->stream_maps[0].dart->force_bypass) + return IOMMU_DOMAIN_IDENTITY; + if (!cfg->stream_maps[0].dart->supports_bypass) + return IOMMU_DOMAIN_DMA; + + return 0; +} + +static const struct iommu_ops apple_dart_iommu_ops = { + .domain_alloc = apple_dart_domain_alloc, + .domain_free = apple_dart_domain_free, + .attach_dev = apple_dart_attach_dev, + .detach_dev = apple_dart_detach_dev, + .map_pages = apple_dart_map_pages, + .unmap_pages = apple_dart_unmap_pages, + .flush_iotlb_all = apple_dart_flush_iotlb_all, + .iotlb_sync = apple_dart_iotlb_sync, + .iotlb_sync_map = apple_dart_iotlb_sync_map, + .iova_to_phys = apple_dart_iova_to_phys, + .probe_device = apple_dart_probe_device, + .release_device = apple_dart_release_device, + .device_group = apple_dart_device_group, + .of_xlate = apple_dart_of_xlate, + .def_domain_type = apple_dart_def_domain_type, + .pgsize_bitmap = -1UL, /* Restricted during dart probe */ +}; + +static irqreturn_t apple_dart_irq(int irq, void *dev) +{ + struct apple_dart *dart = dev; + const char *fault_name = NULL; + u32 error = readl(dart->regs + DART_ERROR); + u32 error_code = FIELD_GET(DART_ERROR_CODE, error); + u32 addr_lo = readl(dart->regs + DART_ERROR_ADDR_LO); + u32 addr_hi = readl(dart->regs + DART_ERROR_ADDR_HI); + u64 addr = addr_lo | (((u64)addr_hi) << 32); + u8 stream_idx = FIELD_GET(DART_ERROR_STREAM, error); + + if (!(error & DART_ERROR_FLAG)) + return IRQ_NONE; + + /* there should only be a single bit set but let's use == to be sure */ + if (error_code == DART_ERROR_READ_FAULT) + fault_name = "READ FAULT"; + else if (error_code == DART_ERROR_WRITE_FAULT) + fault_name = "WRITE FAULT"; + else if (error_code == DART_ERROR_NO_PTE) + fault_name = "NO PTE FOR IOVA"; + else if (error_code == DART_ERROR_NO_PMD) + fault_name = "NO PMD FOR IOVA"; + else if (error_code == DART_ERROR_NO_TTBR) + fault_name = "NO TTBR FOR IOVA"; + else + fault_name = "unknown"; + + dev_err_ratelimited( + dart->dev, + "translation fault: status:0x%x stream:%d code:0x%x (%s) at 0x%llx", + error, stream_idx, error_code, fault_name, addr); + + writel(error, dart->regs + DART_ERROR); + return IRQ_HANDLED; +} + +static int apple_dart_set_bus_ops(const struct iommu_ops *ops) +{ + int ret; + + if (!iommu_present(&platform_bus_type)) { + ret = bus_set_iommu(&platform_bus_type, ops); + if (ret) + return ret; + } +#ifdef CONFIG_PCI + if (!iommu_present(&pci_bus_type)) { + ret = bus_set_iommu(&pci_bus_type, ops); + if (ret) { + bus_set_iommu(&platform_bus_type, NULL); + return ret; + } + } +#endif + return 0; +} + +static int apple_dart_probe(struct platform_device *pdev) +{ + int ret; + u32 dart_params[2]; + struct resource *res; + struct apple_dart *dart; + struct device *dev = &pdev->dev; + + dart = devm_kzalloc(dev, sizeof(*dart), GFP_KERNEL); + if (!dart) + return -ENOMEM; + + dart->dev = dev; + spin_lock_init(&dart->lock); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (resource_size(res) < 0x4000) { + dev_err(dev, "MMIO region too small (%pr)\n", res); + return -EINVAL; + } + + dart->regs = devm_ioremap_resource(dev, res); + if (IS_ERR(dart->regs)) + return PTR_ERR(dart->regs); + + dart->irq = platform_get_irq(pdev, 0); + if (dart->irq < 0) + return -ENODEV; + + ret = devm_clk_bulk_get_all(dev, &dart->clks); + if (ret < 0) + return ret; + dart->num_clks = ret; + + ret = clk_bulk_prepare_enable(dart->num_clks, dart->clks); + if (ret) + return ret; + + ret = apple_dart_hw_reset(dart); + if (ret) + goto err_clk_disable; + + dart_params[0] = readl(dart->regs + DART_PARAMS1); + dart_params[1] = readl(dart->regs + DART_PARAMS2); + dart->pgsize = 1 << FIELD_GET(DART_PARAMS_PAGE_SHIFT, dart_params[0]); + dart->supports_bypass = dart_params[1] & DART_PARAMS_BYPASS_SUPPORT; + dart->force_bypass = dart->pgsize > PAGE_SIZE; + + ret = request_irq(dart->irq, apple_dart_irq, IRQF_SHARED, + "apple-dart fault handler", dart); + if (ret) + goto err_clk_disable; + + platform_set_drvdata(pdev, dart); + + ret = apple_dart_set_bus_ops(&apple_dart_iommu_ops); + if (ret) + goto err_free_irq; + + ret = iommu_device_sysfs_add(&dart->iommu, dev, NULL, "apple-dart.%s", + dev_name(&pdev->dev)); + if (ret) + goto err_remove_bus_ops; + + ret = iommu_device_register(&dart->iommu, &apple_dart_iommu_ops, dev); + if (ret) + goto err_sysfs_remove; + + dev_info( + &pdev->dev, + "DART [pagesize %x, bypass support: %d, bypass forced: %d] initialized\n", + dart->pgsize, dart->supports_bypass, dart->force_bypass); + return 0; + +err_sysfs_remove: + iommu_device_sysfs_remove(&dart->iommu); +err_remove_bus_ops: + apple_dart_set_bus_ops(NULL); +err_free_irq: + free_irq(dart->irq, dart); +err_clk_disable: + clk_bulk_disable_unprepare(dart->num_clks, dart->clks); + + return ret; +} + +static int apple_dart_remove(struct platform_device *pdev) +{ + struct apple_dart *dart = platform_get_drvdata(pdev); + + apple_dart_hw_reset(dart); + free_irq(dart->irq, dart); + apple_dart_set_bus_ops(NULL); + + iommu_device_unregister(&dart->iommu); + iommu_device_sysfs_remove(&dart->iommu); + + clk_bulk_disable_unprepare(dart->num_clks, dart->clks); + + return 0; +} + +static const struct of_device_id apple_dart_of_match[] = { + { .compatible = "apple,t8103-dart", .data = NULL }, + {}, +}; +MODULE_DEVICE_TABLE(of, apple_dart_of_match); + +static struct platform_driver apple_dart_driver = { + .driver = { + .name = "apple-dart", + .of_match_table = apple_dart_of_match, + .suppress_bind_attrs = true, + }, + .probe = apple_dart_probe, + .remove = apple_dart_remove, +}; + +module_platform_driver(apple_dart_driver); + +MODULE_DESCRIPTION("IOMMU API for Apple's DART"); +MODULE_AUTHOR("Sven Peter "); +MODULE_LICENSE("GPL v2"); From faf8e7539643ed958174ad3dfeb43c179a9c4397 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Aug 2021 15:47:19 +0200 Subject: [PATCH 138/474] iommu/dart: APPLE_DART should depend on ARCH_APPLE The Apple DART (Device Address Resolution Table) IOMMU is only present on Apple ARM SoCs like the M1. Hence add a dependency on ARCH_APPLE, to prevent asking the user about this driver when configuring a kernel without support for the Apple Silicon SoC family. Signed-off-by: Geert Uytterhoeven Acked-by: Sven Peter Link: https://lore.kernel.org/r/44fcf525273b32c9afcd7e99acbd346d47f0e047.1628603162.git.geert+renesas@glider.be Signed-off-by: Joerg Roedel --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index dfe81da483e9..e908b8222e4e 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -292,7 +292,7 @@ config SPAPR_TCE_IOMMU config APPLE_DART tristate "Apple DART IOMMU Support" - depends on ARM64 || (COMPILE_TEST && !GENERIC_ATOMIC64) + depends on ARCH_APPLE || (COMPILE_TEST && !GENERIC_ATOMIC64) select IOMMU_API select IOMMU_IO_PGTABLE_LPAE default ARCH_APPLE From 666173ee32e2d6f2853bd54b11b8127ac97eb019 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:35 +0800 Subject: [PATCH 139/474] MIPS: generic: Allow generating FIT image for Marduk board Marduk is based on IMG pistachio SoC. The platform is using MIPS UHI booting protocol and does have a proper devicetree implement, thus it could be a part of generic kernel. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/Makefile | 1 + arch/mips/boot/dts/img/Makefile | 2 ++ arch/mips/generic/Kconfig | 6 ++++++ arch/mips/generic/Platform | 1 + arch/mips/generic/board-marduk.its.S | 22 ++++++++++++++++++++++ 5 files changed, 32 insertions(+) create mode 100644 arch/mips/generic/board-marduk.its.S diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile index 60bd7d2a9ad8..188301164d9e 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -2,6 +2,7 @@ subdir-$(CONFIG_BMIPS_GENERIC) += brcm subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon subdir-$(CONFIG_MACH_PISTACHIO) += img +subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img subdir-$(CONFIG_MACH_INGENIC) += ingenic subdir-$(CONFIG_LANTIQ) += lantiq diff --git a/arch/mips/boot/dts/img/Makefile b/arch/mips/boot/dts/img/Makefile index 441a3c16efb0..24f6bbeadd48 100644 --- a/arch/mips/boot/dts/img/Makefile +++ b/arch/mips/boot/dts/img/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 dtb-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += boston.dtb +dtb-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += pistachio_marduk.dtb + dtb-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb obj-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb.o diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index 657dd93c5e76..7dc5b3821cc6 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -58,6 +58,12 @@ config FIT_IMAGE_FDT_BOSTON enable this if you wish to boot on a MIPS Boston board, as it is expected by the bootloader. +config FIT_IMAGE_FDT_MARDUK + bool "Include FDT for IMG Pistachio Marduk (CI40) boards" + help + Enable this to include the FDT for the IMG Pistachio Marduk (CI40) + from Imagination Technologies in the FIT kernel image. + config FIT_IMAGE_FDT_NI169445 bool "Include FDT for NI 169445" help diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index b871af16b5b6..e1abc113b409 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -24,3 +24,4 @@ its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S its-$(CONFIG_FIT_IMAGE_FDT_JAGUAR2) += board-jaguar2.its.S its-$(CONFIG_FIT_IMAGE_FDT_SERVAL) += board-serval.its.S its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S +its-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += board-marduk.its.S diff --git a/arch/mips/generic/board-marduk.its.S b/arch/mips/generic/board-marduk.its.S new file mode 100644 index 000000000000..4f633794db90 --- /dev/null +++ b/arch/mips/generic/board-marduk.its.S @@ -0,0 +1,22 @@ +/ { + images { + fdt-marduk { + description = "img,pistachio-marduk Device Tree"; + data = /incbin/("boot/dts/img/pistachio_marduk.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash { + algo = "sha1"; + }; + }; + }; + + configurations { + conf-marduk { + description = "Marduk Linux kernel"; + kernel = "kernel"; + fdt = "fdt-marduk"; + }; + }; +}; From d32524a2d05791181756bb4ab4f6e0628471fde2 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:36 +0800 Subject: [PATCH 140/474] MIPS: DTS: Pistachio add missing cpc and cdmm CPC and CDMM addresses are adjustable and we should tell kernel how to place them in devicetree. Note that MACH_PISTACHIO code hardcoded CDMM base to 0x1bdd0000, however it will collide with GIC address range. As we don't have any CDMM device on this platform it won't be a problem. I found another spare range, 0x1bdf0000~0x1be00000 to place CDMM instead. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/img/pistachio.dtsi | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/mips/boot/dts/img/pistachio.dtsi b/arch/mips/boot/dts/img/pistachio.dtsi index dc3b7909de73..b1db8b8f446f 100644 --- a/arch/mips/boot/dts/img/pistachio.dtsi +++ b/arch/mips/boot/dts/img/pistachio.dtsi @@ -900,6 +900,16 @@ }; }; + cpc: cpc@1bde0000 { + compatible = "mti,mips-cpc"; + reg = <0x1bde0000 0x10000>; + }; + + cdmm: cdmm@1bdf0000 { + compatible = "mti,mips-cdmm"; + reg = <0x1bdf0000 0x10000>; + }; + usb_phy: usb-phy { compatible = "img,pistachio-usb-phy"; clocks = <&clk_core CLK_USB_PHY>; From 90429205c000f4befdc212cfade39e358292584c Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:37 +0800 Subject: [PATCH 141/474] clk: pistachio: Make it selectable for generic MIPS kernel We're moving pistachio to generic MIPS kernel. The clk driver should be avilable to the generic MIPS kernel. Signed-off-by: Jiaxun Yang Acked-by: Stephen Boyd Signed-off-by: Thomas Bogendoerfer --- drivers/clk/Kconfig | 1 + drivers/clk/Makefile | 2 +- drivers/clk/pistachio/Kconfig | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 drivers/clk/pistachio/Kconfig diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index e873f9ea2e65..c5b3dc97396a 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -403,6 +403,7 @@ source "drivers/clk/mediatek/Kconfig" source "drivers/clk/meson/Kconfig" source "drivers/clk/mstar/Kconfig" source "drivers/clk/mvebu/Kconfig" +source "drivers/clk/pistachio/Kconfig" source "drivers/clk/qcom/Kconfig" source "drivers/clk/ralink/Kconfig" source "drivers/clk/renesas/Kconfig" diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 2b91d34c582b..e42312121e51 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -97,7 +97,7 @@ obj-y += mstar/ obj-y += mvebu/ obj-$(CONFIG_ARCH_MXS) += mxs/ obj-$(CONFIG_COMMON_CLK_NXP) += nxp/ -obj-$(CONFIG_MACH_PISTACHIO) += pistachio/ +obj-$(CONFIG_COMMON_CLK_PISTACHIO) += pistachio/ obj-$(CONFIG_COMMON_CLK_PXA) += pxa/ obj-$(CONFIG_COMMON_CLK_QCOM) += qcom/ obj-y += ralink/ diff --git a/drivers/clk/pistachio/Kconfig b/drivers/clk/pistachio/Kconfig new file mode 100644 index 000000000000..d00f7b4a25fc --- /dev/null +++ b/drivers/clk/pistachio/Kconfig @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +config COMMON_CLK_PISTACHIO + bool "Support for IMG Pistachio SoC clock controllers" + depends on MIPS || COMPILE_TEST + help + Support for the IMG Pistachio SoC clock controller. + Say Y if you want to include clock support. From 1e4fd60b54cf9f65d2d5b89a9289cbbc42922adf Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:38 +0800 Subject: [PATCH 142/474] clocksource/drivers/pistachio: Make it selectable for MIPS So it will be avilable for generic MIPS kernel. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- drivers/clocksource/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index eb661b539a3e..0f5e3983951a 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -234,8 +234,9 @@ config CLKSRC_LPC32XX Support for the LPC32XX clocksource. config CLKSRC_PISTACHIO - bool "Clocksource for Pistachio SoC" if COMPILE_TEST + bool "Clocksource for Pistachio SoC" depends on HAS_IOMEM + depends on MIPS || COMPILE_TEST select TIMER_OF help Enables the clocksource for the Pistachio SoC. From e238f10d8606ddb9a1728628a218c3f740b940be Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:39 +0800 Subject: [PATCH 143/474] phy: pistachio-usb: Depend on MIPS || COMPILE_TEST So it will be avilable for generic MIPS kernel. Signed-off-by: Jiaxun Yang Acked-By: Vinod Koul Signed-off-by: Thomas Bogendoerfer --- drivers/phy/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/Kconfig b/drivers/phy/Kconfig index 7dd35f1b9cc5..82b63e60c5a2 100644 --- a/drivers/phy/Kconfig +++ b/drivers/phy/Kconfig @@ -37,7 +37,7 @@ config PHY_LPC18XX_USB_OTG config PHY_PISTACHIO_USB tristate "IMG Pistachio USB2.0 PHY driver" - depends on MACH_PISTACHIO + depends on MIPS || COMPILE_TEST select GENERIC_PHY help Enable this to support the USB2.0 PHY on the IMG Pistachio SoC. From f14973038d814c3ad032209794693dceca5eb7cf Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:40 +0800 Subject: [PATCH 144/474] pinctrl: pistachio: Make it as an option So it will be avilable for generic MIPS kernel. -- Signed-off-by: Jiaxun Yang v3: Depend on OF as well Acked-by: Linus Walleij Signed-off-by: Thomas Bogendoerfer --- drivers/pinctrl/Kconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig index f38f12801f18..eb981713b40d 100644 --- a/drivers/pinctrl/Kconfig +++ b/drivers/pinctrl/Kconfig @@ -248,12 +248,15 @@ config PINCTRL_SX150X - 16 bits: sx1509q, sx1506q config PINCTRL_PISTACHIO - def_bool y if MACH_PISTACHIO + bool "IMG Pistachio SoC pinctrl driver" + depends on OF && (MIPS || COMPILE_TEST) depends on GPIOLIB select PINMUX select GENERIC_PINCONF select GPIOLIB_IRQCHIP select OF_GPIO + help + This support pinctrl and gpio driver for IMG Pistachio SoC. config PINCTRL_ST bool From 917b64f1df2b9cec0a0859bd8d96b24679038f78 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:41 +0800 Subject: [PATCH 145/474] MIPS: config: generic: Add config for Marduk board The config is minimal config to allow it boot from SD Card. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/configs/generic/board-marduk.config | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 arch/mips/configs/generic/board-marduk.config diff --git a/arch/mips/configs/generic/board-marduk.config b/arch/mips/configs/generic/board-marduk.config new file mode 100644 index 000000000000..05ca34cd5a73 --- /dev/null +++ b/arch/mips/configs/generic/board-marduk.config @@ -0,0 +1,53 @@ +CONFIG_FIT_IMAGE_FDT_MARDUK=y + +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y + +CONFIG_CLKSRC_PISTACHIO=y + +CONFIG_COMMON_CLK_PISTACHIO=y + +CONFIG_DMADEVICES=y +CONFIG_IMG_MDC_DMA=y + +CONFIG_GPIOLIB=y +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_PCH=y + +CONFIG_I2C=y +CONFIG_I2C_IMG=y + +CONFIG_MMC=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_DW=y +CONFIG_MMC_DW_PLTFM=y + +CONFIG_NETDEVICES=y +CONFIG_STMMAC_ETH=y +CONFIG_STMMAC_PLATFORM=y + +CONFIG_PHY_PISTACHIO_USB=y + +CONFIG_PINCTRL=y +CONFIG_PINCTRL_PISTACHIO=y + +CONFIG_RESET_PISTACHIO=y + +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_8250_DW=y + +CONFIG_SPI=y +CONFIG_SRAM=y + +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_DWC2=y + +CONFIG_CRYPTO_DEV_IMGTEC_HASH=y +CONFIG_IMGPDC_WDT=y +CONFIG_IR_IMG=y +CONFIG_CC10001_ADC=y +CONFIG_SND_SOC_IMG=y From 104f942b2832ab1340dab34ae2dad8b31b772788 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:42 +0800 Subject: [PATCH 146/474] MIPS: Retire MACH_PISTACHIO Now it can be replaced by generic kernel. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kbuild.platforms | 1 - arch/mips/Kconfig | 30 --- arch/mips/boot/dts/Makefile | 1 - arch/mips/boot/dts/img/Makefile | 3 - arch/mips/configs/pistachio_defconfig | 316 -------------------------- arch/mips/pistachio/Kconfig | 14 -- arch/mips/pistachio/Makefile | 2 - arch/mips/pistachio/Platform | 6 - arch/mips/pistachio/init.c | 125 ---------- arch/mips/pistachio/irq.c | 24 -- arch/mips/pistachio/time.c | 55 ----- 11 files changed, 577 deletions(-) delete mode 100644 arch/mips/configs/pistachio_defconfig delete mode 100644 arch/mips/pistachio/Kconfig delete mode 100644 arch/mips/pistachio/Makefile delete mode 100644 arch/mips/pistachio/Platform delete mode 100644 arch/mips/pistachio/init.c delete mode 100644 arch/mips/pistachio/irq.c delete mode 100644 arch/mips/pistachio/time.c diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index e4f6e49417a9..584081df89c2 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -21,7 +21,6 @@ platform-$(CONFIG_MIPS_MALTA) += mti-malta/ platform-$(CONFIG_MACH_NINTENDO64) += n64/ platform-$(CONFIG_NLM_COMMON) += netlogic/ platform-$(CONFIG_PIC32MZDA) += pic32/ -platform-$(CONFIG_MACH_PISTACHIO) += pistachio/ platform-$(CONFIG_RALINK) += ralink/ platform-$(CONFIG_MIKROTIK_RB532) += rb532/ platform-$(CONFIG_SGI_IP22) += sgi-ip22/ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cee6087cd686..b783264243e4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -513,35 +513,6 @@ config MACH_LOONGSON64 and Loongson-2F which will be removed), developed by the Institute of Computing Technology (ICT), Chinese Academy of Sciences (CAS). -config MACH_PISTACHIO - bool "IMG Pistachio SoC based boards" - select BOOT_ELF32 - select BOOT_RAW - select CEVT_R4K - select CLKSRC_MIPS_GIC - select COMMON_CLK - select CSRC_R4K - select DMA_NONCOHERENT - select GPIOLIB - select IRQ_MIPS_CPU - select MFD_SYSCON - select MIPS_CPU_SCACHE - select MIPS_GIC - select PINCTRL - select REGULATOR - select SYS_HAS_CPU_MIPS32_R2 - select SYS_SUPPORTS_32BIT_KERNEL - select SYS_SUPPORTS_LITTLE_ENDIAN - select SYS_SUPPORTS_MIPS_CPS - select SYS_SUPPORTS_MULTITHREADING - select SYS_SUPPORTS_RELOCATABLE - select SYS_SUPPORTS_ZBOOT - select SYS_HAS_EARLY_PRINTK - select USE_GENERIC_EARLY_PRINTK_8250 - select USE_OF - help - This enables support for the IMG Pistachio SoC platform. - config MIPS_MALTA bool "MIPS Malta board" select ARCH_MAY_HAVE_PC_FDC @@ -1088,7 +1059,6 @@ source "arch/mips/ingenic/Kconfig" source "arch/mips/jazz/Kconfig" source "arch/mips/lantiq/Kconfig" source "arch/mips/pic32/Kconfig" -source "arch/mips/pistachio/Kconfig" source "arch/mips/ralink/Kconfig" source "arch/mips/sgi-ip27/Kconfig" source "arch/mips/sibyte/Kconfig" diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile index 188301164d9e..be96d35eb582 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 subdir-$(CONFIG_BMIPS_GENERIC) += brcm subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon -subdir-$(CONFIG_MACH_PISTACHIO) += img subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img subdir-$(CONFIG_MACH_INGENIC) += ingenic diff --git a/arch/mips/boot/dts/img/Makefile b/arch/mips/boot/dts/img/Makefile index 24f6bbeadd48..ebb47490b04b 100644 --- a/arch/mips/boot/dts/img/Makefile +++ b/arch/mips/boot/dts/img/Makefile @@ -2,6 +2,3 @@ dtb-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += boston.dtb dtb-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += pistachio_marduk.dtb - -dtb-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb -obj-$(CONFIG_MACH_PISTACHIO) += pistachio_marduk.dtb.o diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig deleted file mode 100644 index b9adf15ebbec..000000000000 --- a/arch/mips/configs/pistachio_defconfig +++ /dev/null @@ -1,316 +0,0 @@ -# CONFIG_LOCALVERSION_AUTO is not set -CONFIG_DEFAULT_HOSTNAME="localhost" -CONFIG_SYSVIPC=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_PREEMPT_VOLUNTARY=y -CONFIG_IKCONFIG=m -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=18 -CONFIG_CGROUPS=y -CONFIG_CGROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -CONFIG_CGROUP_FREEZER=y -CONFIG_NAMESPACES=y -CONFIG_USER_NS=y -CONFIG_BLK_DEV_INITRD=y -# CONFIG_RD_BZIP2 is not set -# CONFIG_RD_LZMA is not set -# CONFIG_RD_LZO is not set -# CONFIG_RD_LZ4 is not set -CONFIG_CC_OPTIMIZE_FOR_SIZE=y -CONFIG_EMBEDDED=y -# CONFIG_COMPAT_BRK is not set -CONFIG_PROFILING=y -CONFIG_MACH_PISTACHIO=y -CONFIG_MIPS_CPS=y -CONFIG_NR_CPUS=4 -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -CONFIG_CPU_IDLE=y -# CONFIG_MIPS_CPS_CPUIDLE is not set -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -# CONFIG_COMPACTION is not set -CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 -CONFIG_ZSMALLOC=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_NET_KEY=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_MROUTE=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m -# CONFIG_INET_DIAG is not set -CONFIG_TCP_CONG_ADVANCED=y -# CONFIG_TCP_CONG_BIC is not set -# CONFIG_TCP_CONG_WESTWOOD is not set -# CONFIG_TCP_CONG_HTCP is not set -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_MD5SIG=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_XFRM_MODE_TRANSPORT=m -CONFIG_INET6_XFRM_MODE_TUNNEL=m -CONFIG_INET6_XFRM_MODE_BEET=m -CONFIG_IPV6_SIT=m -CONFIG_NETWORK_SECMARK=y -CONFIG_NETFILTER=y -# CONFIG_BRIDGE_NETFILTER is not set -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_DSCP=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_DSCP=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NF_NAT_IPV4=m -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_MANGLE=y -CONFIG_NF_NAT_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_MARK=y -CONFIG_BT=m -CONFIG_BT_RFCOMM=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIVHCI=m -CONFIG_CFG80211=m -CONFIG_NL80211_TESTMODE=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_WEXT=y -CONFIG_MAC80211=m -CONFIG_MAC80211_LEDS=y -CONFIG_MAC80211_DEBUGFS=y -CONFIG_MAC80211_DEBUG_MENU=y -CONFIG_MAC80211_VERBOSE_DEBUG=y -CONFIG_RFKILL=y -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_DEBUG_DEVRES=y -CONFIG_CONNECTOR=y -CONFIG_MTD=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_SPI_NOR=y -CONFIG_MTD_UBI=y -CONFIG_MTD_UBI_BLOCK=y -CONFIG_ZRAM=m -CONFIG_BLK_DEV_LOOP=y -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_BLK_DEV_SR=m -CONFIG_SCSI_SPI_ATTRS=y -CONFIG_MD=y -CONFIG_BLK_DEV_DM=y -CONFIG_DM_CRYPT=y -CONFIG_DM_VERITY=y -CONFIG_NETDEVICES=y -CONFIG_TUN=m -CONFIG_VETH=m -# CONFIG_NET_VENDOR_MARVELL is not set -# CONFIG_NET_VENDOR_MICREL is not set -# CONFIG_NET_VENDOR_MICROCHIP is not set -# CONFIG_NET_VENDOR_NATSEMI is not set -# CONFIG_NET_VENDOR_SEEQ is not set -# CONFIG_NET_VENDOR_SMSC is not set -CONFIG_STMMAC_ETH=y -# CONFIG_NET_VENDOR_VIA is not set -CONFIG_PPP=m -CONFIG_PPP_ASYNC=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_MCS7830=m -# CONFIG_USB_NET_CDC_SUBSET is not set -# CONFIG_USB_NET_ZAURUS is not set -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_LIBERTAS_THINFIRM=m -CONFIG_RT2X00=m -CONFIG_RT2800USB=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_INPUT_EVDEV=y -# CONFIG_KEYBOARD_ATKBD is not set -CONFIG_KEYBOARD_GPIO=y -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -# CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DW=y -CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_HW_RANDOM=y -CONFIG_TCG_TPM=y -CONFIG_I2C=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_IMG=y -CONFIG_I2C_STUB=m -CONFIG_SPI=y -CONFIG_SPI_BITBANG=m -CONFIG_SPI_IMG_SPFI=y -CONFIG_SPI_SPIDEV=y -CONFIG_DEBUG_GPIO=y -CONFIG_GPIO_SYSFS=y -CONFIG_POWER_SUPPLY=y -CONFIG_THERMAL=y -CONFIG_WATCHDOG=y -CONFIG_IMGPDC_WDT=y -CONFIG_REGULATOR_FIXED_VOLTAGE=y -CONFIG_REGULATOR_GPIO=y -CONFIG_RC_CORE=y -CONFIG_RC_DEVICES=y -CONFIG_IR_IMG=y -CONFIG_IR_IMG_NEC=y -CONFIG_IR_IMG_JVC=y -CONFIG_IR_IMG_SONY=y -CONFIG_IR_IMG_SHARP=y -CONFIG_IR_IMG_SANYO=y -CONFIG_IR_IMG_RC5=y -CONFIG_IR_IMG_RC6=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_FB=y -CONFIG_FB_MODE_HELPERS=y -# CONFIG_LCD_CLASS_DEVICE is not set -CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_SOUND=y -CONFIG_SND=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -# CONFIG_SND_SPI is not set -CONFIG_SND_USB_AUDIO=m -CONFIG_USB=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -# CONFIG_USB_DEFAULT_PERSIST is not set -CONFIG_USB_MON=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_ACM=y -CONFIG_USB_STORAGE=y -CONFIG_USB_DWC2=y -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_MMC=y -CONFIG_MMC_BLOCK_MINORS=16 -CONFIG_MMC_TEST=m -CONFIG_MMC_DW=y -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_RTC_CLASS=y -CONFIG_DMADEVICES=y -CONFIG_IMG_MDC_DMA=y -CONFIG_STAGING=y -CONFIG_ASHMEM=y -# CONFIG_IOMMU_SUPPORT is not set -CONFIG_MEMORY=y -CONFIG_IIO=y -CONFIG_CC10001_ADC=y -CONFIG_PWM=y -CONFIG_PWM_IMG=y -CONFIG_PHY_PISTACHIO_USB=y -CONFIG_ANDROID=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_DNOTIFY is not set -CONFIG_FUSE_FS=m -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -CONFIG_VFAT_FS=m -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_ECRYPT_FS=y -CONFIG_HFSPLUS_FS=m -CONFIG_UBIFS_FS=y -CONFIG_SQUASHFS=y -CONFIG_SQUASHFS_FILE_DIRECT=y -CONFIG_SQUASHFS_LZO=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_YAMA=y -CONFIG_CRYPTO_AUTHENC=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=m -CONFIG_CRYPTO_ARC4=y -CONFIG_CRYPTO_DES=y -CONFIG_CRC_CCITT=y -CONFIG_CRC_T10DIF=m -CONFIG_CRC7=m -# CONFIG_XZ_DEC_X86 is not set -CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0 -# CONFIG_SCHED_DEBUG is not set -CONFIG_SCHEDSTATS=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_CREDENTIALS=y -CONFIG_FUNCTION_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_LKDTM=y -CONFIG_TEST_UDELAY=m diff --git a/arch/mips/pistachio/Kconfig b/arch/mips/pistachio/Kconfig deleted file mode 100644 index 9a0e06c95184..000000000000 --- a/arch/mips/pistachio/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config PISTACHIO_GPTIMER_CLKSRC - bool "Enable General Purpose Timer based clocksource" - depends on MACH_PISTACHIO - select CLKSRC_PISTACHIO - select MIPS_EXTERNAL_TIMER - help - This option enables a clocksource driver based on a Pistachio - SoC General Purpose external timer. - - If you want to enable the CPUFreq, you need to enable - this option. - - If you don't want to enable CPUFreq, you can leave this disabled. diff --git a/arch/mips/pistachio/Makefile b/arch/mips/pistachio/Makefile deleted file mode 100644 index 66f4af17fb66..000000000000 --- a/arch/mips/pistachio/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-y += init.o irq.o time.o diff --git a/arch/mips/pistachio/Platform b/arch/mips/pistachio/Platform deleted file mode 100644 index c59de86dbddf..000000000000 --- a/arch/mips/pistachio/Platform +++ /dev/null @@ -1,6 +0,0 @@ -# -# IMG Pistachio SoC -# -load-$(CONFIG_MACH_PISTACHIO) += 0xffffffff80400000 -zload-$(CONFIG_MACH_PISTACHIO) += 0xffffffff81000000 -all-$(CONFIG_MACH_PISTACHIO) := uImage.gz diff --git a/arch/mips/pistachio/init.c b/arch/mips/pistachio/init.c deleted file mode 100644 index e0bacfc3c6b4..000000000000 --- a/arch/mips/pistachio/init.c +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio platform setup - * - * Copyright (C) 2014 Google, Inc. - * Copyright (C) 2016 Imagination Technologies - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -/* - * Core revision register decoding - * Bits 23 to 20: Major rev - * Bits 15 to 8: Minor rev - * Bits 7 to 0: Maintenance rev - */ -#define PISTACHIO_CORE_REV_REG 0xB81483D0 -#define PISTACHIO_CORE_REV_A1 0x00100006 -#define PISTACHIO_CORE_REV_B0 0x00100106 - -const char *get_system_type(void) -{ - u32 core_rev; - const char *sys_type; - - core_rev = __raw_readl((const void *)PISTACHIO_CORE_REV_REG); - - switch (core_rev) { - case PISTACHIO_CORE_REV_B0: - sys_type = "IMG Pistachio SoC (B0)"; - break; - - case PISTACHIO_CORE_REV_A1: - sys_type = "IMG Pistachio SoC (A1)"; - break; - - default: - sys_type = "IMG Pistachio SoC"; - break; - } - - return sys_type; -} - -void __init *plat_get_fdt(void) -{ - if (fw_arg0 != -2) - panic("Device-tree not present"); - return (void *)fw_arg1; -} - -void __init plat_mem_setup(void) -{ - __dt_setup_arch(plat_get_fdt()); -} - -#define DEFAULT_CPC_BASE_ADDR 0x1bde0000 -#define DEFAULT_CDMM_BASE_ADDR 0x1bdd0000 - -phys_addr_t mips_cpc_default_phys_base(void) -{ - return DEFAULT_CPC_BASE_ADDR; -} - -phys_addr_t mips_cdmm_phys_base(void) -{ - return DEFAULT_CDMM_BASE_ADDR; -} - -static void __init mips_nmi_setup(void) -{ - void *base; - - base = cpu_has_veic ? - (void *)(CAC_BASE + 0xa80) : - (void *)(CAC_BASE + 0x380); - memcpy(base, except_vec_nmi, 0x80); - flush_icache_range((unsigned long)base, - (unsigned long)base + 0x80); -} - -static void __init mips_ejtag_setup(void) -{ - void *base; - extern char except_vec_ejtag_debug[]; - - base = cpu_has_veic ? - (void *)(CAC_BASE + 0xa00) : - (void *)(CAC_BASE + 0x300); - memcpy(base, except_vec_ejtag_debug, 0x80); - flush_icache_range((unsigned long)base, - (unsigned long)base + 0x80); -} - -void __init prom_init(void) -{ - board_nmi_handler_setup = mips_nmi_setup; - board_ejtag_handler_setup = mips_ejtag_setup; - - mips_cm_probe(); - mips_cpc_probe(); - register_cps_smp_ops(); - - pr_info("SoC Type: %s\n", get_system_type()); -} - -void __init device_tree_init(void) -{ - if (!initial_boot_params) - return; - - unflatten_and_copy_device_tree(); -} diff --git a/arch/mips/pistachio/irq.c b/arch/mips/pistachio/irq.c deleted file mode 100644 index 437c3101ac45..000000000000 --- a/arch/mips/pistachio/irq.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio IRQ setup - * - * Copyright (C) 2014 Google, Inc. - */ - -#include -#include -#include - -#include -#include - -void __init arch_init_irq(void) -{ - pr_info("EIC is %s\n", cpu_has_veic ? "on" : "off"); - pr_info("VINT is %s\n", cpu_has_vint ? "on" : "off"); - - if (!cpu_has_veic) - mips_cpu_irq_init(); - - irqchip_init(); -} diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c deleted file mode 100644 index de64751dec40..000000000000 --- a/arch/mips/pistachio/time.c +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Pistachio clocksource/timer setup - * - * Copyright (C) 2014 Google, Inc. - */ - -#include -#include -#include -#include -#include - -#include -#include - -unsigned int get_c0_compare_int(void) -{ - return gic_get_c0_compare_int(); -} - -int get_c0_perfcount_int(void) -{ - return gic_get_c0_perfcount_int(); -} -EXPORT_SYMBOL_GPL(get_c0_perfcount_int); - -int get_c0_fdc_int(void) -{ - return gic_get_c0_fdc_int(); -} - -void __init plat_time_init(void) -{ - struct device_node *np; - struct clk *clk; - - of_clk_init(NULL); - timer_probe(); - - np = of_get_cpu_node(0, NULL); - if (!np) { - pr_err("Failed to get CPU node\n"); - return; - } - - clk = of_clk_get(np, 0); - if (IS_ERR(clk)) { - pr_err("Failed to get CPU clock: %ld\n", PTR_ERR(clk)); - return; - } - - mips_hpt_frequency = clk_get_rate(clk) / 2; - clk_put(clk); -} From 3f66601ef3f3e336c1f8181f183001b58be62067 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Fri, 23 Jul 2021 10:25:43 +0800 Subject: [PATCH 147/474] MIPS: Make a alias for pistachio_defconfig For those who miss the old defconfig, make a alias to generic kernel. Signed-off-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 4e942b7ef022..6eaeee3fb46c 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -560,6 +560,9 @@ sead3micro_defconfig-y := micro32r2el_defconfig BOARDS=sead-3 legacy_defconfigs += xilfpga_defconfig xilfpga_defconfig-y := 32r2el_defconfig BOARDS=xilfpga +legacy_defconfigs += pistachio_defconfig +pistachio_defconfig-y := 32r2el_defconfig BOARDS=marduk + .PHONY: $(legacy_defconfigs) $(legacy_defconfigs): $(Q)$(MAKE) -f $(srctree)/Makefile $($@-y) From 4d2ee1be4c2a5552f083b6d7db4be224f96313b5 Mon Sep 17 00:00:00 2001 From: Huilong Deng Date: Wed, 11 Aug 2021 12:36:15 +0800 Subject: [PATCH 148/474] MIPS: generic: Return true/false (not 1/0) from bool functions ./arch/mips/generic/board-ocelot.c:29:9-10: WARNING: return of 0/1 in function 'ocelot_detect' with return type bool Signed-off-by: Huilong Deng Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/board-ocelot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/generic/board-ocelot.c b/arch/mips/generic/board-ocelot.c index c238e95190ac..7115410acb4f 100644 --- a/arch/mips/generic/board-ocelot.c +++ b/arch/mips/generic/board-ocelot.c @@ -26,13 +26,13 @@ static __init bool ocelot_detect(void) tlb_probe_hazard(); idx = read_c0_index(); if (idx < 0) - return 0; + return false; /* A TLB entry exists, lets assume its usable and check the CHIP ID */ rev = __raw_readl((void __iomem *)DEVCPU_GCB_CHIP_REGS_CHIP_ID); if ((rev & CHIP_ID_PART_ID) != OCELOT_PART_ID) - return 0; + return false; /* Copy command line from bootloader early for Initrd detection */ if (fw_arg0 < 10 && (fw_arg1 & 0xFFF00000) == 0x80000000) { @@ -44,7 +44,7 @@ static __init bool ocelot_detect(void) strcpy(arcs_cmdline, prom_argv[1]); } - return 1; + return true; } static void __init ocelot_earlyprintk_init(void) From b11748e693166679acc13c8a4328a71efe1d4a89 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:20 +0200 Subject: [PATCH 149/474] powerpc: wii.dts: Reduce the size of the control area This is wrong, but needed in order to avoid overlapping ranges with the OTP area added in the next commit. A refactor of this part of the device tree is needed: according to Wiibrew[1], this area starts at 0x0d800000 and spans 0x400 bytes (that is, 0x100 32-bit registers), encompassing PIC and GPIO registers, amongst the ones already exposed in this device tree, which should become children of the control@d800000 node. [1] https://wiibrew.org/wiki/Hardware/Hollywood_Registers Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-4-linkmauve@linkmauve.fr --- arch/powerpc/boot/dts/wii.dts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index aaa381da1906..c5fb54f8cc02 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -216,7 +216,13 @@ control@d800100 { compatible = "nintendo,hollywood-control"; - reg = <0x0d800100 0x300>; + /* + * Both the address and length are wrong, according to + * Wiibrew this should be <0x0d800000 0x400>, but it + * requires refactoring the PIC1 and GPIO nodes before + * changing that. + */ + reg = <0x0d800100 0xa0>; }; disk@d806000 { From 562a610b4c5119034aed300f6ae212ec7a20c4b4 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:21 +0200 Subject: [PATCH 150/474] powerpc: wii.dts: Expose the OTP on this platform This can be used by the newly-added nintendo-otp nvmem module. Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-5-linkmauve@linkmauve.fr --- arch/powerpc/boot/dts/wii.dts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index c5fb54f8cc02..e9c945b123c6 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -219,12 +219,17 @@ /* * Both the address and length are wrong, according to * Wiibrew this should be <0x0d800000 0x400>, but it - * requires refactoring the PIC1 and GPIO nodes before - * changing that. + * requires refactoring the PIC1, GPIO and OTP nodes + * before changing that. */ reg = <0x0d800100 0xa0>; }; + otp@d8001ec { + compatible = "nintendo,hollywood-otp"; + reg = <0x0d8001ec 0x8>; + }; + disk@d806000 { compatible = "nintendo,hollywood-di"; reg = <0x0d806000 0x40>; From 140a89b7bfe65e9649c4a3678f74c32556834ec1 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:22 +0200 Subject: [PATCH 151/474] powerpc: wii_defconfig: Enable OTP by default This selects the nintendo-otp module when building for this platform. Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-6-linkmauve@linkmauve.fr --- arch/powerpc/configs/wii_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index 379c171f3ddd..a0c45bf2bfb1 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -99,6 +99,7 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_GENERIC=y +CONFIG_NVMEM_NINTENDO_OTP=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m From f34ee9cb2c5ac5af426fee6fa4591a34d187e696 Mon Sep 17 00:00:00 2001 From: "Pratik R. Sampat" Date: Wed, 28 Jul 2021 17:35:00 +0530 Subject: [PATCH 152/474] cpufreq: powernv: Fix init_chip_info initialization in numa=off In the numa=off kernel command-line configuration init_chip_info() loops around the number of chips and attempts to copy the cpumask of that node which is NULL for all iterations after the first chip. Hence, store the cpu mask for each chip instead of derving cpumask from node while populating the "chips" struct array and copy that to the chips[i].mask Fixes: 053819e0bf84 ("cpufreq: powernv: Handle throttling due to Pmax capping at chip level") Cc: stable@vger.kernel.org # v4.3+ Reported-by: Shirisha Ganta Signed-off-by: Pratik R. Sampat Reviewed-by: Gautham R. Shenoy [mpe: Rename goto label to out_free_chip_cpu_mask] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728120500.87549-2-psampat@linux.ibm.com --- drivers/cpufreq/powernv-cpufreq.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 005600cef273..6fbb46b2f6da 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -36,6 +36,7 @@ #define MAX_PSTATE_SHIFT 32 #define LPSTATE_SHIFT 48 #define GPSTATE_SHIFT 56 +#define MAX_NR_CHIPS 32 #define MAX_RAMP_DOWN_TIME 5120 /* @@ -1046,12 +1047,20 @@ static int init_chip_info(void) unsigned int *chip; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; + cpumask_t *chip_cpu_mask; int ret = 0; chip = kcalloc(num_possible_cpus(), sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; + /* Allocate a chip cpu mask large enough to fit mask for all chips */ + chip_cpu_mask = kcalloc(MAX_NR_CHIPS, sizeof(cpumask_t), GFP_KERNEL); + if (!chip_cpu_mask) { + ret = -ENOMEM; + goto free_and_return; + } + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); @@ -1059,22 +1068,25 @@ static int init_chip_info(void) prev_chip_id = id; chip[nr_chips++] = id; } + cpumask_set_cpu(cpu, &chip_cpu_mask[nr_chips-1]); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) { ret = -ENOMEM; - goto free_and_return; + goto out_free_chip_cpu_mask; } for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; - cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); + cpumask_copy(&chips[i].mask, &chip_cpu_mask[i]); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); for_each_cpu(cpu, &chips[i].mask) per_cpu(chip_info, cpu) = &chips[i]; } +out_free_chip_cpu_mask: + kfree(chip_cpu_mask); free_and_return: kfree(chip); return ret; From 3e188b1ae8807f26cc5a530a9d55f3f643fe050a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:30 +0530 Subject: [PATCH 153/474] powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling. Also add a debugfs entry for tlb_local_single_page_flush_ceiling. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7..1fa2bc6a969e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" @@ -1106,8 +1107,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -1524,3 +1525,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + From dbf77fed8b302e87561c7c2fc06050c88f4d3120 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:31 +0530 Subject: [PATCH 154/474] powerpc: rename powerpc_debugfs_root to arch_debugfs_dir No functional change in this patch. arch_debugfs_dir is the generic kernel name declared in linux/debugfs.h for arch-specific debugfs directory. Architectures like x86/s390 already use the name. Rename powerpc specific powerpc_debugfs_root to arch_debugfs_dir. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/debugfs.h | 13 ------------- arch/powerpc/kernel/Makefile | 3 ++- arch/powerpc/kernel/dawr.c | 3 +-- arch/powerpc/kernel/eeh.c | 16 ++++++++-------- arch/powerpc/kernel/eeh_cache.c | 4 ++-- arch/powerpc/kernel/fadump.c | 4 ++-- arch/powerpc/kernel/hw_breakpoint.c | 1 - arch/powerpc/kernel/kdebugfs.c | 14 ++++++++++++++ arch/powerpc/kernel/security.c | 16 ++++++++-------- arch/powerpc/kernel/setup-common.c | 13 ------------- arch/powerpc/kernel/setup_64.c | 1 - arch/powerpc/kernel/traps.c | 4 ++-- arch/powerpc/kvm/book3s_xics.c | 6 +++--- arch/powerpc/kvm/book3s_xive.c | 3 +-- arch/powerpc/kvm/book3s_xive_native.c | 3 +-- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_tlb.c | 6 +++--- arch/powerpc/mm/ptdump/bats.c | 4 ++-- arch/powerpc/mm/ptdump/segment_regs.c | 4 ++-- arch/powerpc/platforms/cell/axon_msi.c | 4 ++-- arch/powerpc/platforms/powernv/memtrace.c | 3 +-- arch/powerpc/platforms/powernv/opal-imc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-lpc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-xscom.c | 4 ++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- arch/powerpc/platforms/pseries/dtl.c | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 5 +++-- arch/powerpc/sysdev/xive/common.c | 3 +-- arch/powerpc/xmon/xmon.c | 6 +++--- 30 files changed, 75 insertions(+), 92 deletions(-) delete mode 100644 arch/powerpc/include/asm/debugfs.h create mode 100644 arch/powerpc/kernel/kdebugfs.c diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h deleted file mode 100644 index 2c5c48571d75..000000000000 --- a/arch/powerpc/include/asm/debugfs.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_DEBUGFS_H -#define _ASM_POWERPC_DEBUGFS_H - -/* - * Copyright 2017, Michael Ellerman, IBM Corporation. - */ - -#include - -extern struct dentry *powerpc_debugfs_root; - -#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f66b63e81c3b..7be36c1e1db6 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o firmware.o \ - hw_breakpoint_constraints.o interrupt.o + hw_breakpoint_constraints.o interrupt.o \ + kdebugfs.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index cdc2dccb987d..64e423d2fe0f 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -101,7 +100,7 @@ static int __init dawr_force_setup(void) if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { /* Turn DAWR off by default, but allow admin to turn it on */ debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &dawr_force_enable, &dawr_enable_fops); } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bbdcc86d01b..e9b597ed423c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -21,9 +21,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void) proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show); #ifdef CONFIG_DEBUG_FS debugfs_create_file_unsafe("eeh_enable", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, - powerpc_debugfs_root, &eeh_max_freezes); + arch_debugfs_dir, &eeh_max_freezes); debugfs_create_bool("eeh_disable_recovery", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &eeh_debugfs_no_recover); debugfs_create_file_unsafe("eeh_dev_check", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_dev_break", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_force_recover_fops); debugfs_create_file_unsafe("eeh_dev_can_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_can_recover_fops); eeh_cache_debugfs_init(); #endif diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index bf3270426d82..9bdaaf7fddc9 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -12,8 +12,8 @@ #include #include #include +#include #include -#include #include @@ -283,6 +283,6 @@ DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); void eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b990075285f5..b7ceb041743c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -24,8 +24,8 @@ #include #include #include +#include -#include #include #include #include @@ -1557,7 +1557,7 @@ static void fadump_init_files(void) return; } - debugfs_create_file("fadump_region", 0444, powerpc_debugfs_root, NULL, + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, &fadump_region_fops); if (fw_dump.dump_active) { diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 21a638aff72f..91a3be14808b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c new file mode 100644 index 000000000000..36d3124d5a8b --- /dev/null +++ b/arch/powerpc/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("powerpc", NULL); + return 0; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index cc51fa52e783..1a998490fe60 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -106,7 +106,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, static __init int barrier_nospec_debugfs_init(void) { debugfs_create_file_unsafe("barrier_nospec", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_barrier_nospec); return 0; } @@ -114,7 +114,7 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { - debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + debugfs_create_x64("security_features", 0400, arch_debugfs_dir, &powerpc_security_features); return 0; } @@ -420,7 +420,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir, NULL, &fops_stf_barrier); return 0; } @@ -748,7 +748,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, static __init int count_cache_flush_debugfs_init(void) { debugfs_create_file_unsafe("count_cache_flush", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_count_cache_flush); return 0; } @@ -834,9 +834,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set static __init int rfi_flush_debugfs_init(void) { - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); - debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); + debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush); return 0; } device_initcall(rfi_flush_debugfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index aa9c2d01424a..b1e43b69a559 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -773,18 +772,6 @@ static int __init check_cache_coherency(void) late_initcall(check_cache_coherency); #endif /* CONFIG_CHECK_CACHE_COHERENCY */ -#ifdef CONFIG_DEBUG_FS -struct dentry *powerpc_debugfs_root; -EXPORT_SYMBOL(powerpc_debugfs_root); - -static int powerpc_debugfs_init(void) -{ - powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); - return 0; -} -arch_initcall(powerpc_debugfs_init); -#endif - void ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1ff258f6c76c..eaa79a0996d1 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..c8f648727d36 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,10 +37,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2267,7 +2267,7 @@ static int __init ppc_warn_emulated_init(void) struct ppc_emulated_entry *entries = (void *)&ppc_emulated; dir = debugfs_create_dir("emulated_instructions", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated); diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 303e3cb096db..ebd5d920de8c 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -10,13 +10,13 @@ #include #include #include - +#include #include + #include #include #include #include -#include #include #include @@ -1024,7 +1024,7 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) return; } - xics->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xics, &xics_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 912c1e9eef6b..a18db9e16ea4 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -2360,7 +2359,7 @@ static void xive_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir, xive, &xive_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index af65ea21bde7..99db9ac49901 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -1268,7 +1267,7 @@ static void xive_native_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xive, &xive_native_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ac5720371c0d..c145776d3ae5 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -36,8 +36,8 @@ #include #include #include +#include -#include #include #include #include @@ -2072,7 +2072,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 300099de553b..9e16c7b1a6c5 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,9 +6,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -520,7 +520,7 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. */ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 1fa2bc6a969e..7724af19ed7e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include #include "internal.h" @@ -1529,9 +1529,9 @@ EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); static int __init create_tlb_single_page_flush_ceiling(void) { debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_single_page_flush_ceiling); debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); return 0; } late_initcall(create_tlb_single_page_flush_ceiling); diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index c4c628b03cf8..8bf7383fb26c 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include "ptdump.h" @@ -103,7 +103,7 @@ static const struct file_operations bats_fops = { static int __init bats_init(void) { debugfs_create_file("block_address_translation", 0400, - powerpc_debugfs_root, NULL, &bats_fops); + arch_debugfs_dir, NULL, &bats_fops); return 0; } device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 565048a0c9be..9223dfb85c51 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -6,7 +6,7 @@ * This dumps the content of Segment Registers */ -#include +#include static void seg_show(struct seq_file *m, int i) { @@ -55,7 +55,7 @@ static const struct file_operations sr_fops = { static int __init sr_init(void) { - debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root, + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, NULL, &sr_fops); return 0; } diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index ca2555b8a0c2..82335e364c44 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include #include #include @@ -480,6 +480,6 @@ void axon_msi_debug_setup(struct device_node *dn, struct axon_msic *msic) snprintf(name, sizeof(name), "msic_%d", of_node_to_nid(dn)); - debugfs_create_file(name, 0600, powerpc_debugfs_root, msic, &fops_msic); + debugfs_create_file(name, 0600, arch_debugfs_dir, msic, &fops_msic); } #endif /* DEBUG */ diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 537a4daed614..877720c64515 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -18,7 +18,6 @@ #include #include #include -#include #include /* This enables us to keep track of the memory removed from each node. */ @@ -330,7 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, static int memtrace_init(void) { memtrace_debugfs_dir = debugfs_create_dir("memtrace", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_file("enable", 0600, memtrace_debugfs_dir, NULL, &memtrace_init_fops); diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index ba02a75c1410..05d3832019b9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -13,11 +13,11 @@ #include #include #include +#include #include #include #include #include -#include static struct dentry *imc_debugfs_parent; @@ -56,7 +56,7 @@ static void export_imc_mode_and_cmd(struct device_node *node, u32 cb_offset; struct imc_mem_info *ptr = pmu_ptr->mem_info; - imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); + imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir); if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 608569082ba0..1e5d51db40f8 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -10,13 +10,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include static int opal_lpc_chip_id = -1; @@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void) if (opal_lpc_chip_id < 0) return -ENODEV; - root = debugfs_create_dir("lpc", powerpc_debugfs_root); + root = debugfs_create_dir("lpc", arch_debugfs_dir); rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index fd510d961b8c..6b4eed2ef4fa 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -14,11 +14,11 @@ #include #include #include +#include #include #include #include -#include #include static u64 opal_scom_unmangle(u64 addr) @@ -189,7 +189,7 @@ static int scom_debug_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return 0; - root = debugfs_create_dir("scom", powerpc_debugfs_root); + root = debugfs_create_dir("scom", arch_debugfs_dir); if (!root) return -1; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2389cd79c3c8..3dd35c327d1c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -2475,7 +2475,7 @@ static void pnv_pci_ioda_create_dbgfs(void) phb = hose->private_data; sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, phb, &pnv_pci_diag_data_fops); diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 982f069e4c31..352af5b14a0f 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -338,7 +338,7 @@ static int dtl_init(void) /* set up common debugfs structure */ - dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir); debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 869ef638698a..bd1fcb0881ea 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include #include @@ -39,7 +41,6 @@ #include #include #include -#include #include #include "pseries.h" @@ -2019,7 +2020,7 @@ static int __init vpa_debugfs_init(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; - vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 4018964bbd69..458645c7a72b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -1769,7 +1768,7 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug); int xive_core_debug_init(void) { if (xive_enabled()) - debugfs_create_file("xive", 0400, powerpc_debugfs_root, + debugfs_create_file("xive", 0400, arch_debugfs_dir, NULL, &xive_core_debug_fops); return 0; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index da4d7f225a40..ead460b80905 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -26,8 +26,8 @@ #include #include #include +#include -#include #include #include #include @@ -4077,8 +4077,8 @@ DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, static int __init setup_xmon_dbgfs(void) { - debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, - &xmon_dbgfs_ops); + debugfs_create_file("xmon", 0600, arch_debugfs_dir, NULL, + &xmon_dbgfs_ops); return 0; } device_initcall(setup_xmon_dbgfs); From 7e35ef662ca05c42dbc2f401bb76d9219dd7fd02 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:19 +0530 Subject: [PATCH 155/474] powerpc/pseries: rename min_common_depth to primary_domain_index No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/numa.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1f..79132744b728 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; static int form1_affinity; @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity) if (!numa_enabled) goto out; - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); + if (of_read_number(associativity, 1) >= primary_domain_index) + nid = of_read_number(&associativity[primary_domain_index], 1); /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= nr_node_ids) @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static int __init find_primary_domain_index(void) { - int depth; + int index; struct device_node *root; if (firmware_has_feature(FW_FEATURE_OPAL)) @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void) } if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); + index = of_read_number(distance_ref_points, 1); } else { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void) goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); } /* @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void) } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; nid = of_read_number(&aa.arrays[index], 1); if (nid == 0xffff || nid >= nr_node_ids) @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void) return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * Even though we connect cpus to numa domains later in SMP @@ -924,7 +924,7 @@ static void __init find_possible_nodes(void) goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); for (i = 0; i < max_nodes; i++) { @@ -933,7 +933,7 @@ static void __init find_possible_nodes(void) } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -1266,7 +1266,7 @@ int cpu_to_coregroup_id(int cpu) goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: From 0eacd06bb8adea8dd9edb0a30144166d9f227e64 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:20 +0530 Subject: [PATCH 156/474] powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Also make related code cleanup that will allow adding FORM2_AFFINITY in later patches. No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/firmware.h | 4 +-- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/mm/numa.c | 35 ++++++++++++++--------- arch/powerpc/platforms/pseries/firmware.c | 2 +- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7604673787d6..60b631161360 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -44,7 +44,7 @@ #define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) #define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000) #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000) -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000) +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000) #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) @@ -69,7 +69,7 @@ enum { FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 324a13351749..df9fec9d232c 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_MSI 0x0201 /* PCIe/MSI support */ #define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */ #define OV5_XCMO 0x0440 /* Page Coalescing */ -#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ +#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a5bf355ce1d6..57db605ad33a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 79132744b728..0bad11b3e929 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data); static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; @@ -190,7 +193,7 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid, { int i; - if (!form1_affinity) + if (affinity_form != FORM1_AFFINITY) return; for (i = 0; i < distance_ref_points_depth; i++) { @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void) int index; struct device_node *root; + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; + if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); else @@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void) } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - index = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + "short ibm,associativity-reference-points\n"); goto err; } index = of_read_number(&distance_ref_points[1], 1); + } else { + index = of_read_number(distance_ref_points, 1); } /* diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 4c7b7f5a2ebc..5d4c2bc20bba 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -119,7 +119,7 @@ struct vec5_fw_feature { static __initdata struct vec5_fw_feature vec5_fw_features_table[] = { - {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, From 8ddc6448ec5a5ef50eaa581a7dec0e12a02850ff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:21 +0530 Subject: [PATCH 157/474] powerpc/pseries: Consolidate different NUMA distance update code paths The associativity details of the newly added resourced are collected from the hypervisor via "ibm,configure-connector" rtas call. Update the numa distance details of the newly added numa node after the above call. Instead of updating NUMA distance every time we lookup a node id from the associativity property, add helpers that can be used during boot which does this only once. Also remove the distance update from node id lookup helpers. Currently, we duplicate parsing code for ibm,associativity and ibm,associativity-lookup-arrays in the kernel. The associativity array provided by these device tree properties are very similar and hence can use a helper to parse the node id and numa distance details. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-4-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 2 + arch/powerpc/mm/numa.c | 212 +++++++++++++----- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 + .../platforms/pseries/hotplug-memory.c | 2 + 4 files changed, 161 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index e4db64c0e184..a6425a70c37b 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu) } int of_drconf_to_nid_single(struct drmem_lmb *lmb); +void update_numa_distance(struct device_node *node); #else @@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } +static inline void update_numa_distance(struct device_node *node) {} #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 0bad11b3e929..3cda37c52f26 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -208,50 +208,35 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { - int i; + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; - if (affinity_form != FORM1_AFFINITY) - return; + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; + nid = of_read_number(&associativity[index], 1); - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; } - /* * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA * info is found. */ static int associativity_to_nid(const __be32 *associativity) { - int nid = NUMA_NO_NODE; + int array_sz = of_read_number(associativity, 1); - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= primary_domain_index) - nid = of_read_number(&associativity[primary_domain_index], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } - -out: - return nid; + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); } /* Returns the nid associated with the given device tree node, @@ -287,6 +272,60 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) +{ + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } +} + static int __init find_primary_domain_index(void) { int index; @@ -433,6 +472,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } +static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -453,26 +524,19 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; - nid = of_read_number(&aa.arrays[index], 1); + const __be32 *associativity; - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; - - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -492,12 +556,30 @@ static int vphn_get_nid(long lcpu) rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -692,7 +774,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -709,6 +791,7 @@ static int __init parse_numa_properties(void) struct device_node *memory; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { printk(KERN_WARNING "NUMA disabled by user\n"); @@ -734,18 +817,30 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } @@ -781,8 +876,11 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index e1f224320102..1ef40ef699a6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -580,6 +580,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } + update_numa_distance(dn); + rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 0beb3ca2b549..14fccd7b9c99 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } + update_numa_distance(lmb_node); + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!dr_node) { dlpar_free_cc_nodes(lmb_node); From ef31cb83d19c4c589d650747cd5a7e502be9f665 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:22 +0530 Subject: [PATCH 158/474] powerpc/pseries: Add a helper for form1 cpu distance This helper is only used with the dispatch trace log collection. A later patch will add Form2 affinity support and this change helps in keeping that simpler. Also add a comment explaining we don't expect the code to be called with FORM0 Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-5-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 4 ++-- arch/powerpc/mm/numa.c | 10 +++++++++- arch/powerpc/platforms/pseries/lpar.c | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6425a70c37b..a4712ecad3e9 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) @@ -84,7 +84,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev, static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { return 0; } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3cda37c52f26..fe3a9a38eddf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index bd1fcb0881ea..3df6bdfea475 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -262,7 +262,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) return -EIO; - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); } static int cpu_home_node_dispatch_distance(int disp_cpu) @@ -282,7 +282,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) if (!disp_cpu_assoc || !vcpu_assoc) return -EIO; - return cpu_distance(disp_cpu_assoc, vcpu_assoc); + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); } static void update_vcpu_disp_stat(int disp_cpu) From 1c6b5a7e74052768977855f95d6b8812f6e7772c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:23 +0530 Subject: [PATCH 159/474] powerpc/pseries: Add support for FORM2 associativity PAPR interface currently supports two different ways of communicating resource grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping. Form 0 is the older format and is now considered deprecated. This patch adds another resource grouping named FORM2. Signed-off-by: Daniel Henrique Barboza Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-6-aneesh.kumar@linux.ibm.com --- Documentation/powerpc/associativity.rst | 104 ++++++++++ arch/powerpc/include/asm/firmware.h | 3 +- arch/powerpc/include/asm/prom.h | 1 + arch/powerpc/kernel/prom_init.c | 3 +- arch/powerpc/mm/numa.c | 219 +++++++++++++++++----- arch/powerpc/platforms/pseries/firmware.c | 1 + 6 files changed, 278 insertions(+), 53 deletions(-) create mode 100644 Documentation/powerpc/associativity.rst diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst new file mode 100644 index 000000000000..07e7dd3d6c87 --- /dev/null +++ b/Documentation/powerpc/associativity.rst @@ -0,0 +1,104 @@ +============================ +NUMA resource associativity +============================= + +Associativity represents the groupings of the various platform resources into +domains of substantially similar mean performance relative to resources outside +of that domain. Resources subsets of a given domain that exhibit better +performance relative to each other than relative to other resources subsets +are represented as being members of a sub-grouping domain. This performance +characteristic is presented in terms of NUMA node distance within the Linux kernel. +From the platform view, these groups are also referred to as domains. + +PAPR interface currently supports different ways of communicating these resource +grouping details to the OS. These are referred to as Form 0, Form 1 and Form2 +associativity grouping. Form 0 is the oldest format and is now considered deprecated. + +Hypervisor indicates the type/form of associativity used via "ibm,architecture-vec-5 property". +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1. +A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used. + +Form 0 +----- +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE). + +Form 1 +----- +With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity +device tree properties are used to determine the NUMA distance between resource groups/domains. + +The “ibm,associativity” property contains a list of one or more numbers (domainID) +representing the resource’s platform grouping domains. + +The “ibm,associativity-reference-points” property contains a list of one or more numbers +(domainID index) that represents the 1 based ordinal in the associativity lists. +The list of domainID indexes represents an increasing hierarchy of resource grouping. + +ex: +{ primary domainID index, secondary domainID index, tertiary domainID index.. } + +Linux kernel uses the domainID at the primary domainID index as the NUMA node id. +Linux kernel computes NUMA distance between two domains by recursively comparing +if they belong to the same higher-level domains. For mismatch at every higher +level of the resource group, the kernel doubles the NUMA distance between the +comparing domains. + +Form 2 +------- +Form 2 associativity format adds separate device tree properties representing NUMA node distance +thereby making the node distance computation flexible. Form 2 also allows flexible primary +domain numbering. With numa distance computation now detached from the index value in +"ibm,associativity-reference-points" property, Form 2 allows a large number of primary domain +ids at the same domainID index representing resource groups of different performance/latency +characteristics. + +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the +"ibm,architecture-vec-5" property. + +"ibm,numa-lookup-index-table" property contains a list of one or more numbers representing +the domainIDs present in the system. The offset of the domainID in this property is +used as an index while computing numa distance information via "ibm,numa-distance-table". + +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by +N domainID encoded as with encode-int + +For ex: +"ibm,numa-lookup-index-table" = {4, 0, 8, 250, 252}. The offset of domainID 8 (2) is used when +computing the distance of domain 8 from other domains present in the system. For the rest of +this document, this offset will be referred to as domain distance offset. + +"ibm,numa-distance-table" property contains a list of one or more numbers representing the NUMA +distance between resource groups/domains present in the system. + +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by +N distance values encoded as with encode-bytes. The max distance value we could encode is 255. +The number N must be equal to the square of m where m is the number of domainIDs in the +numa-lookup-index-table. + +For ex: +ibm,numa-lookup-index-table = <3 0 8 40>; +ibm,numa-distace-table = <9>, /bits/ 8 < 10 20 80 + 20 10 160 + 80 160 10>; + | 0 8 40 +--|------------ + | +0 | 10 20 80 + | +8 | 20 10 160 + | +40| 80 160 10 + +A possible "ibm,associativity" property for resources in node 0, 8 and 40 + +{ 3, 6, 7, 0 } +{ 3, 6, 9, 8 } +{ 3, 6, 7, 40} + +With "ibm,associativity-reference-points" { 0x3 } + +"ibm,lookup-index-table" helps in having a compact representation of distance matrix. +Since domainID can be sparse, the matrix of distances can also be effectively sparse. +With "ibm,lookup-index-table" we can achieve a compact representation of +distance information. diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 60b631161360..97a3bd9ffeb9 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -53,6 +53,7 @@ #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) #ifndef __ASSEMBLY__ @@ -73,7 +74,7 @@ enum { FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | - FW_FEATURE_RPT_INVALIDATE, + FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index df9fec9d232c..5c80152e8f18 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ +#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ #define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 57db605ad33a..95a42d49e291 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | + OV5_FEAT(OV5_FORM2_AFFINITY), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe3a9a38eddf..fe9c6ced40b2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells; #define FORM0_AFFINITY 0 #define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -166,56 +171,6 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) -{ - int dist = 0; - - int i, index; - - for (i = 0; i < distance_ref_points_depth; i++) { - index = be32_to_cpu(distance_ref_points[i]); - if (cpu1_assoc[index] == cpu2_assoc[index]) - break; - dist++; - } - - return dist; -} - -int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) -{ - /* We should not get called with FORM0 */ - VM_WARN_ON(affinity_form == FORM0_AFFINITY); - - return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); -} - -/* must hold reference to node during call */ -static const __be32 *of_get_associativity(struct device_node *dev) -{ - return of_get_property(dev, "ibm,associativity", NULL); -} - -int __node_distance(int a, int b) -{ - int i; - int distance = LOCAL_DISTANCE; - - if (affinity_form == FORM0_AFFINITY) - return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); - - for (i = 0; i < distance_ref_points_depth; i++) { - if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) - break; - - /* Double the distance for each NUMA level */ - distance *= 2; - } - - return distance; -} -EXPORT_SYMBOL(__node_distance); - static int __associativity_to_nid(const __be32 *associativity, int max_array_sz) { @@ -247,6 +202,76 @@ static int associativity_to_nid(const __be32 *associativity) return __associativity_to_nid((associativity + 1), array_sz); } +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist = 0; + + int i, index; + + for (i = 0; i < distance_ref_points_depth; i++) { + index = be32_to_cpu(distance_ref_points[i]); + if (cpu1_assoc[index] == cpu2_assoc[index]) + break; + dist++; + } + + return dist; +} + +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); +} + +/* must hold reference to node during call */ +static const __be32 *of_get_associativity(struct device_node *dev) +{ + return of_get_property(dev, "ibm,associativity", NULL); +} + +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; + + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) + return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); + + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; + + /* Double the distance for each NUMA level */ + distance *= 2; + } + + return distance; +} +EXPORT_SYMBOL(__node_distance); + /* Returns the nid associated with the given device tree node, * or -1 if not found. */ @@ -320,6 +345,8 @@ static void initialize_form1_numa_distance(const __be32 *associativity) */ void update_numa_distance(struct device_node *node) { + int nid; + if (affinity_form == FORM0_AFFINITY) return; else if (affinity_form == FORM1_AFFINITY) { @@ -332,6 +359,84 @@ void update_numa_distance(struct device_node *node) initialize_form1_numa_distance(associativity); return; } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; + struct device_node *root; + const __u8 *numa_dist_table; + const __be32 *numa_lookup_index; + int numa_dist_table_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); + numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + /* Skip the size which is encoded int */ + numa_dist_table += sizeof(__be32); + + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", + numa_dist_table_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (numa_dist_table_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + /* consider everybody else just remote. */ + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + if (nodeA == nodeB) + numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; + else + numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; + } + } + } + + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + } + } + of_node_put(root); } static int __init find_primary_domain_index(void) @@ -344,6 +449,9 @@ static int __init find_primary_domain_index(void) */ if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + dbg("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { dbg("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; @@ -388,9 +496,12 @@ static int __init find_primary_domain_index(void) index = of_read_number(&distance_ref_points[1], 1); } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. @@ -819,6 +930,12 @@ static int __init parse_numa_properties(void) dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); + /* * Even though we connect cpus to numa domains later in SMP * init, we need to know the node ids now. This is because diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 5d4c2bc20bba..f162156b7b68 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -123,6 +123,7 @@ vec5_fw_features_table[] = { {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, }; static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) From ef75702d6d65a1993dc0d6dd423fefaa08901f24 Mon Sep 17 00:00:00 2001 From: Sai Prakash Ranjan Date: Wed, 11 Aug 2021 21:34:26 +0530 Subject: [PATCH 160/474] iommu/arm-smmu: Optimize ->tlb_flush_walk() for qcom implementation Currently for iommu_unmap() of large scatter-gather list with page size elements, the majority of time is spent in flushing of partial walks in __arm_lpae_unmap() which is a VA based TLB invalidation invalidating page-by-page on iommus like arm-smmu-v2 (TLBIVA). For example: to unmap a 32MB scatter-gather list with page size elements (8192 entries), there are 16->2MB buffer unmaps based on the pgsize (2MB for 4K granule) and each of 2MB will further result in 512 TLBIVAs (2MB/4K) resulting in a total of 8192 TLBIVAs (512*16) for 16->2MB causing a huge overhead. On qcom implementation, there are several performance improvements for TLB cache invalidations in HW like wait-for-safe (for realtime clients such as camera and display) and few others to allow for cache lookups/updates when TLBI is in progress for the same context bank. So the cost of over-invalidation is less compared to the unmap latency on several usecases like camera which deals with large buffers. So, ASID based TLB invalidations (TLBIASID) can be used to invalidate the entire context for partial walk flush thereby improving the unmap latency. For this example of 32MB scatter-gather list unmap, this change results in just 16 ASID based TLB invalidations (TLBIASIDs) as opposed to 8192 TLBIVAs thereby increasing the performance of unmaps drastically. Test on QTI SM8150 SoC for 10 iterations of iommu_{map_sg}/unmap: (average over 10 iterations) Before this optimization: size iommu_map_sg iommu_unmap 4K 2.067 us 1.854 us 64K 9.598 us 8.802 us 1M 148.890 us 130.718 us 2M 305.864 us 67.291 us 12M 1793.604 us 390.838 us 16M 2386.848 us 518.187 us 24M 3563.296 us 775.989 us 32M 4747.171 us 1033.364 us After this optimization: size iommu_map_sg iommu_unmap 4K 1.723 us 1.765 us 64K 9.880 us 8.869 us 1M 155.364 us 135.223 us 2M 303.906 us 5.385 us 12M 1786.557 us 21.250 us 16M 2391.890 us 27.437 us 24M 3570.895 us 39.937 us 32M 4755.234 us 51.797 us Real world data also shows big difference in unmap performance as below: There were reports of camera frame drops because of high overhead in iommu unmap without this optimization because of frequent unmaps issued by camera of about 100MB/s taking more than 100ms thereby causing frame drops. Signed-off-by: Sai Prakash Ranjan Link: https://lore.kernel.org/r/20210811160426.10312-1-saiprakash.ranjan@codeaurora.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 11 +++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu.c | 13 ++++++++++--- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 9b9d13ec5a88..55690af1b25d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -193,6 +193,8 @@ static int qcom_adreno_smmu_init_context(struct arm_smmu_domain *smmu_domain, { struct adreno_smmu_priv *priv; + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + /* Only enable split pagetables for the GPU device (SID 0) */ if (!qcom_adreno_smmu_is_gpu_device(dev)) return 0; @@ -235,6 +237,14 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { { } }; +static int qcom_smmu_init_context(struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg, struct device *dev) +{ + smmu_domain->cfg.flush_walk_prefer_tlbiasid = true; + + return 0; +} + static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) { unsigned int last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); @@ -358,6 +368,7 @@ static int qcom_smmu500_reset(struct arm_smmu_device *smmu) } static const struct arm_smmu_impl qcom_smmu_impl = { + .init_context = qcom_smmu_init_context, .cfg_probe = qcom_smmu_cfg_probe, .def_domain_type = qcom_smmu_def_domain_type, .reset = qcom_smmu500_reset, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index f7da8953afbe..67b660b0551d 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -327,9 +327,16 @@ static void arm_smmu_tlb_inv_range_s2(unsigned long iova, size_t size, static void arm_smmu_tlb_inv_walk_s1(unsigned long iova, size_t size, size_t granule, void *cookie) { - arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, - ARM_SMMU_CB_S1_TLBIVA); - arm_smmu_tlb_sync_context(cookie); + struct arm_smmu_domain *smmu_domain = cookie; + struct arm_smmu_cfg *cfg = &smmu_domain->cfg; + + if (cfg->flush_walk_prefer_tlbiasid) { + arm_smmu_tlb_inv_context_s1(cookie); + } else { + arm_smmu_tlb_inv_range_s1(iova, size, granule, cookie, + ARM_SMMU_CB_S1_TLBIVA); + arm_smmu_tlb_sync_context(cookie); + } } static void arm_smmu_tlb_add_page_s1(struct iommu_iotlb_gather *gather, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index a50271595960..432de2f742c3 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -346,6 +346,7 @@ struct arm_smmu_cfg { }; enum arm_smmu_cbar_type cbar; enum arm_smmu_context_fmt fmt; + bool flush_walk_prefer_tlbiasid; }; #define ARM_SMMU_INVALID_IRPTNDX 0xff From eff19474b1bd60286213e5052ccf246b6a6c7199 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:49 +0800 Subject: [PATCH 161/474] iommu/arm-smmu-v3: Use command queue batching helpers to improve performance The obvious key to the performance optimization of commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is to allow multiple cores to insert commands in parallel after a brief mutex contention. Obviously, inserting as many commands at a time as possible can reduce the number of times the mutex contention participates, thereby improving the overall performance. At least it reduces the number of calls to function arm_smmu_cmdq_issue_cmdlist(). Therefore, use command queue batching helpers to insert multiple commands at a time. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-2-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 952291840c76..ac0c4b21cfa8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1747,15 +1747,16 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) { int i; struct arm_smmu_cmdq_ent cmd; + struct arm_smmu_cmdq_batch cmds = {}; arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; - arm_smmu_cmdq_issue_cmd(master->smmu, &cmd); + arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); } - return arm_smmu_cmdq_issue_sync(master->smmu); + return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); } int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, From 4537f6f1e2d8d22ec6f9c6bd3844fdccb931f46e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:50 +0800 Subject: [PATCH 162/474] iommu/arm-smmu-v3: Add and use static helper function arm_smmu_cmdq_issue_cmd_with_sync() The obvious key to the performance optimization of commit 587e6c10a7ce ("iommu/arm-smmu-v3: Reduce contention during command-queue insertion") is to allow multiple cores to insert commands in parallel after a brief mutex contention. Obviously, inserting as many commands at a time as possible can reduce the number of times the mutex contention participates, thereby improving the overall performance. At least it reduces the number of calls to function arm_smmu_cmdq_issue_cmdlist(). Therefore, function arm_smmu_cmdq_issue_cmd_with_sync() is added to insert the 'cmd+sync' commands at a time. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-3-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 +++++++++++---------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ac0c4b21cfa8..3cdb01f93f1b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -845,8 +845,9 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, return ret; } -static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent, + bool sync) { u64 cmd[CMDQ_ENT_DWORDS]; @@ -856,12 +857,19 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; } - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync); } -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { - return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); + return __arm_smmu_cmdq_issue_cmd(smmu, ent, false); +} + +static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) +{ + return __arm_smmu_cmdq_issue_cmd(smmu, ent, true); } static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, @@ -929,8 +937,7 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) .tlbi.asid = asid, }; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, @@ -1211,8 +1218,7 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) }, }; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, @@ -1824,8 +1830,7 @@ static void arm_smmu_tlb_inv_context(void *cookie) } else { cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); } @@ -3339,18 +3344,16 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) /* Invalidate any cached configuration */ cmd.opcode = CMDQ_OP_CFGI_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); /* Invalidate any stale TLB entries */ if (smmu->features & ARM_SMMU_FEAT_HYP) { cmd.opcode = CMDQ_OP_TLBI_EL2_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } cmd.opcode = CMDQ_OP_TLBI_NSNH_ALL; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); /* Event queue */ writeq_relaxed(smmu->evtq.q.q_base, smmu->base + ARM_SMMU_EVTQ_BASE); From 8639cc83aac5dd1a197ed05b8f717acc35bb0248 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:51 +0800 Subject: [PATCH 163/474] iommu/arm-smmu-v3: Add and use static helper function arm_smmu_get_cmdq() One SMMU has only one normal CMDQ. Therefore, this CMDQ is used regardless of the core on which the command is inserted. It can be referenced directly through "smmu->cmdq". However, one SMMU has multiple ECMDQs, and the ECMDQ used by the core on which the command insertion is executed may be different. So the helper function arm_smmu_get_cmdq() is added, which returns the CMDQ/ECMDQ that the current core should use. Currently, the code that supports ECMDQ is not added. just simply returns "&smmu->cmdq". Many subfunctions of arm_smmu_cmdq_issue_cmdlist() use "&smmu->cmdq" or "&smmu->cmdq.q" directly. To support ECMDQ, they need to call the newly added function arm_smmu_get_cmdq() instead. Note that normal CMDQ is still required until ECMDQ is available. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-4-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 ++++++++++++--------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3cdb01f93f1b..c1cd2adab3ef 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -335,10 +335,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } -static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, - u32 prod) +static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) +{ + return &smmu->cmdq; +} + +static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, + struct arm_smmu_queue *q, u32 prod) { - struct arm_smmu_queue *q = &smmu->cmdq.q; struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -579,7 +583,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, { unsigned long flags; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); int ret = 0; /* @@ -595,7 +599,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, queue_poll_init(smmu, &qp); do { - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); if (!queue_full(llq)) break; @@ -614,7 +618,7 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, { int ret = 0; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); queue_poll_init(smmu, &qp); @@ -637,12 +641,12 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, struct arm_smmu_ll_queue *llq) { struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 prod = llq->prod; int ret = 0; queue_poll_init(smmu, &qp); - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); do { if (queue_consumed(llq, prod)) break; @@ -732,7 +736,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u32 prod; unsigned long flags; bool owner; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); struct arm_smmu_ll_queue llq, head; int ret = 0; @@ -772,7 +776,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); if (sync) { prod = queue_inc_prod_n(&llq, n); - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod); queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); /* From 2cbeaf3f36eb300cae1f344e125162c42fb091a3 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 11 Aug 2021 19:48:52 +0800 Subject: [PATCH 164/474] iommu/arm-smmu-v3: Extract reusable function __arm_smmu_cmdq_skip_err() When SMMU_GERROR.CMDQP_ERR is different to SMMU_GERRORN.CMDQP_ERR, it indicates that one or more errors have been encountered on a command queue control page interface. We need to traverse all ECMDQs in that control page to find all errors. For each ECMDQ error handling, it is much the same as the CMDQ error handling. This common processing part is extracted as a new function __arm_smmu_cmdq_skip_err(). Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210811114852.2429-5-thunder.leizhen@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c1cd2adab3ef..283bfad5f28f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -359,7 +359,8 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, arm_smmu_cmdq_build_cmd(cmd, &ent); } -static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) +static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q) { static const char * const cerror_str[] = { [CMDQ_ERR_CERROR_NONE_IDX] = "No error", @@ -370,7 +371,6 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) int i; u64 cmd[CMDQ_ENT_DWORDS]; - struct arm_smmu_queue *q = &smmu->cmdq.q; u32 cons = readl_relaxed(q->cons_reg); u32 idx = FIELD_GET(CMDQ_CONS_ERR, cons); struct arm_smmu_cmdq_ent cmd_sync = { @@ -417,6 +417,11 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } +static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) +{ + __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q); +} + /* * Command queue locking. * This is a form of bastardised rwlock with the following major changes: From fac956710ab0812f9e395e9f7a27da551412830f Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 11 Aug 2021 23:49:26 +0800 Subject: [PATCH 165/474] iommu/arm-smmu-v3: Stop pre-zeroing batch commands Pre-zeroing the batched commands structure is inefficient, as individual commands are zeroed later in arm_smmu_cmdq_build_cmd(). The size is quite large and commonly most commands won't even be used: struct arm_smmu_cmdq_batch cmds = {}; 345c: 52800001 mov w1, #0x0 // #0 3460: d2808102 mov x2, #0x408 // #1032 3464: 910143a0 add x0, x29, #0x50 3468: 94000000 bl 0 Stop pre-zeroing the complete structure and only zero the num member. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1628696966-88386-1-git-send-email-john.garry@huawei.com Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 283bfad5f28f..3216e746c28a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -955,7 +955,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, size_t i; unsigned long flags; struct arm_smmu_master *master; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_CD, @@ -965,6 +965,8 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain *smmu_domain, }, }; + cmds.num = 0; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { for (i = 0; i < master->num_streams; i++) { @@ -1781,7 +1783,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, unsigned long flags; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_master *master; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) return 0; @@ -1805,6 +1807,8 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); + cmds.num = 0; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master, &smmu_domain->devices, domain_head) { if (!master->ats_enabled) @@ -1852,7 +1856,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, struct arm_smmu_device *smmu = smmu_domain->smmu; unsigned long end = iova + size, num_pages = 0, tg = 0; size_t inv_range = granule; - struct arm_smmu_cmdq_batch cmds = {}; + struct arm_smmu_cmdq_batch cmds; if (!size) return; @@ -1870,6 +1874,8 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, num_pages = size >> tg; } + cmds.num = 0; + while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { /* From db87a7199229b75c9996bf78117eceb81854fce2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 13 Apr 2021 16:38:09 +0000 Subject: [PATCH 166/474] powerpc/bug: Remove specific powerpc BUG_ON() and WARN_ON() on PPC32 powerpc BUG_ON() and WARN_ON() are based on using twnei instruction. For catching simple conditions like a variable having value 0, this is efficient because it does the test and the trap at the same time. But most conditions used with BUG_ON or WARN_ON are more complex and forces GCC to format the condition into a 0 or 1 value in a register. This will usually require 2 to 3 instructions. The most efficient solution would be to use __builtin_trap() because GCC is able to optimise the use of the different trap instructions based on the requested condition, but this is complex if not impossible for the following reasons: - __builtin_trap() is a non-recoverable instruction, so it can't be used for WARN_ON - Knowing which line of code generated the trap would require the analysis of DWARF information. This is not a feature we have today. As mentioned in commit 8d4fbcfbe0a4 ("Fix WARN_ON() on bitfield ops") the way WARN_ON() is implemented is suboptimal. That commit also mentions an issue with 'long long' condition. It fixed it for WARN_ON() but the same problem still exists today with BUG_ON() on PPC32. It will be fixed by using the generic implementation. By using the generic implementation, gcc will naturally generate a branch to the unconditional trap generated by BUG(). As modern powerpc implement zero-cycle branch, that's even more efficient. And for the functions using WARN_ON() and its return, the test on return from WARN_ON() is now also used for the WARN_ON() itself. On PPC64 we don't want it because we want to be able to use CFAR register to track how we entered the code that trapped. The CFAR register would be clobbered by the branch. A simple test function: unsigned long test9w(unsigned long a, unsigned long b) { if (WARN_ON(!b)) return 0; return a / b; } Before the patch: 0000046c : 46c: 7c 89 00 34 cntlzw r9,r4 470: 55 29 d9 7e rlwinm r9,r9,27,5,31 474: 0f 09 00 00 twnei r9,0 478: 2c 04 00 00 cmpwi r4,0 47c: 41 82 00 0c beq 488 480: 7c 63 23 96 divwu r3,r3,r4 484: 4e 80 00 20 blr 488: 38 60 00 00 li r3,0 48c: 4e 80 00 20 blr After the patch: 00000468 : 468: 2c 04 00 00 cmpwi r4,0 46c: 41 82 00 0c beq 478 470: 7c 63 23 96 divwu r3,r3,r4 474: 4e 80 00 20 blr 478: 0f e0 00 00 twui r0,0 47c: 38 60 00 00 li r3,0 480: 4e 80 00 20 blr So we see before the patch we need 3 instructions on the likely path to handle the WARN_ON(). With the patch the trap goes on the unlikely path. See below the difference at the entry of system_call_exception where we have several BUG_ON(), allthough less impressing. With the patch: 00000000 : 0: 81 6a 00 84 lwz r11,132(r10) 4: 90 6a 00 88 stw r3,136(r10) 8: 71 60 00 02 andi. r0,r11,2 c: 41 82 00 70 beq 7c 10: 71 60 40 00 andi. r0,r11,16384 14: 41 82 00 6c beq 80 18: 71 6b 80 00 andi. r11,r11,32768 1c: 41 82 00 68 beq 84 20: 94 21 ff e0 stwu r1,-32(r1) 24: 93 e1 00 1c stw r31,28(r1) 28: 7d 8c 42 e6 mftb r12 ... 7c: 0f e0 00 00 twui r0,0 80: 0f e0 00 00 twui r0,0 84: 0f e0 00 00 twui r0,0 Without the patch: 00000000 : 0: 94 21 ff e0 stwu r1,-32(r1) 4: 93 e1 00 1c stw r31,28(r1) 8: 90 6a 00 88 stw r3,136(r10) c: 81 6a 00 84 lwz r11,132(r10) 10: 69 60 00 02 xori r0,r11,2 14: 54 00 ff fe rlwinm r0,r0,31,31,31 18: 0f 00 00 00 twnei r0,0 1c: 69 60 40 00 xori r0,r11,16384 20: 54 00 97 fe rlwinm r0,r0,18,31,31 24: 0f 00 00 00 twnei r0,0 28: 69 6b 80 00 xori r11,r11,32768 2c: 55 6b 8f fe rlwinm r11,r11,17,31,31 30: 0f 0b 00 00 twnei r11,0 34: 7d 8c 42 e6 mftb r12 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b286e07fb771a664b631cd07a40b09c06f26e64b.1618331881.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bug.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 0b2162890d8b..d844de5adfcb 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -68,7 +68,11 @@ BUG_ENTRY("twi 31, 0, 0", 0); \ unreachable(); \ } while (0) +#define HAVE_ARCH_BUG +#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) + +#ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ if (__builtin_constant_p(x)) { \ if (x) \ @@ -78,8 +82,6 @@ } \ } while (0) -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) - #define WARN_ON(x) ({ \ int __ret_warn_on = !!(x); \ if (__builtin_constant_p(__ret_warn_on)) { \ @@ -93,9 +95,10 @@ unlikely(__ret_warn_on); \ }) -#define HAVE_ARCH_BUG #define HAVE_ARCH_BUG_ON #define HAVE_ARCH_WARN_ON +#endif + #endif /* __ASSEMBLY __ */ #else #ifdef __ASSEMBLY__ From 1e688dd2a3d6759d416616ff07afc4bb836c4213 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 13 Apr 2021 16:38:10 +0000 Subject: [PATCH 167/474] powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto Using asm goto in __WARN_FLAGS() and WARN_ON() allows more flexibility to GCC. For that add an entry to the exception table so that program_check_exception() knowns where to resume execution after a WARNING. Here are two exemples. The first one is done on PPC32 (which benefits from the previous patch), the second is on PPC64. unsigned long test(struct pt_regs *regs) { int ret; WARN_ON(regs->msr & MSR_PR); return regs->gpr[3]; } unsigned long test9w(unsigned long a, unsigned long b) { if (WARN_ON(!b)) return 0; return a / b; } Before the patch: 000003a8 : 3a8: 81 23 00 84 lwz r9,132(r3) 3ac: 71 29 40 00 andi. r9,r9,16384 3b0: 40 82 00 0c bne 3bc 3b4: 80 63 00 0c lwz r3,12(r3) 3b8: 4e 80 00 20 blr 3bc: 0f e0 00 00 twui r0,0 3c0: 80 63 00 0c lwz r3,12(r3) 3c4: 4e 80 00 20 blr 0000000000000bf0 <.test9w>: bf0: 7c 89 00 74 cntlzd r9,r4 bf4: 79 29 d1 82 rldicl r9,r9,58,6 bf8: 0b 09 00 00 tdnei r9,0 bfc: 2c 24 00 00 cmpdi r4,0 c00: 41 82 00 0c beq c0c <.test9w+0x1c> c04: 7c 63 23 92 divdu r3,r3,r4 c08: 4e 80 00 20 blr c0c: 38 60 00 00 li r3,0 c10: 4e 80 00 20 blr After the patch: 000003a8 : 3a8: 81 23 00 84 lwz r9,132(r3) 3ac: 71 29 40 00 andi. r9,r9,16384 3b0: 40 82 00 0c bne 3bc 3b4: 80 63 00 0c lwz r3,12(r3) 3b8: 4e 80 00 20 blr 3bc: 0f e0 00 00 twui r0,0 0000000000000c50 <.test9w>: c50: 7c 89 00 74 cntlzd r9,r4 c54: 79 29 d1 82 rldicl r9,r9,58,6 c58: 0b 09 00 00 tdnei r9,0 c5c: 7c 63 23 92 divdu r3,r3,r4 c60: 4e 80 00 20 blr c70: 38 60 00 00 li r3,0 c74: 4e 80 00 20 blr In the first exemple, we see GCC doesn't need to duplicate what happens after the trap. In the second exemple, we see that GCC doesn't need to emit a test and a branch in the likely path in addition to the trap. We've got some WARN_ON() in .softirqentry.text section so it needs to be added in the OTHER_TEXT_SECTIONS in modpost.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/389962b1b702e3c78d169e59bcfac56282889173.1618331882.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/kup.h | 2 +- arch/powerpc/include/asm/bug.h | 54 +++++++++++++++---- arch/powerpc/include/asm/extable.h | 14 +++++ arch/powerpc/include/asm/ppc_asm.h | 11 +--- arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/misc_32.S | 2 +- arch/powerpc/kernel/traps.c | 9 +++- scripts/mod/modpost.c | 2 +- .../powerpc/primitives/asm/extable.h | 1 + 9 files changed, 72 insertions(+), 25 deletions(-) create mode 120000 tools/testing/selftests/powerpc/primitives/asm/extable.h diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index a1cc73a88710..170339969b7c 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -90,7 +90,7 @@ /* Prevent access to userspace using any key values */ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED) 999: tdne \gpr1, \gpr2 - EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_BOOK3S_KUAP, 67) #endif .endm diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index d844de5adfcb..1ee0f22313ee 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include +#include #ifdef CONFIG_BUG @@ -30,6 +31,11 @@ .endm #endif /* verbose */ +.macro EMIT_WARN_ENTRY addr,file,line,flags + EX_TABLE(\addr,\addr+4) + EMIT_BUG_ENTRY \addr,\file,\line,\flags +.endm + #else /* !__ASSEMBLY__ */ /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and sizeof(struct bug_entry), respectively */ @@ -58,6 +64,16 @@ "i" (sizeof(struct bug_entry)), \ ##__VA_ARGS__) +#define WARN_ENTRY(insn, flags, label, ...) \ + asm_volatile_goto( \ + "1: " insn "\n" \ + EX_TABLE(1b, %l[label]) \ + _EMIT_BUG_ENTRY \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ + "i" (sizeof(struct bug_entry)), \ + ##__VA_ARGS__ : : label) + /* * BUG_ON() and WARN_ON() do their best to cooperate with compile-time * optimisations. However depending on the complexity of the condition @@ -70,7 +86,15 @@ } while (0) #define HAVE_ARCH_BUG -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) +#define __WARN_FLAGS(flags) do { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags), __label_warn_on); \ + unreachable(); \ + \ +__label_warn_on: \ + break; \ +} while (0) #ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ @@ -83,15 +107,24 @@ } while (0) #define WARN_ON(x) ({ \ - int __ret_warn_on = !!(x); \ - if (__builtin_constant_p(__ret_warn_on)) { \ - if (__ret_warn_on) \ + bool __ret_warn_on = false; \ + do { \ + if (__builtin_constant_p((x))) { \ + if (!(x)) \ + break; \ __WARN(); \ - } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", \ - BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ - "r" (__ret_warn_on)); \ - } \ + __ret_warn_on = true; \ + } else { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY(PPC_TLNEI " %4, 0", \ + BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ + __label_warn_on, "r" (x)); \ + break; \ +__label_warn_on: \ + __ret_warn_on = true; \ + } \ + } while (0); \ unlikely(__ret_warn_on); \ }) @@ -104,8 +137,11 @@ #ifdef __ASSEMBLY__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm +.macro EMIT_WARN_ENTRY addr,file,line,flags +.endm #else /* !__ASSEMBLY__ */ #define _EMIT_BUG_ENTRY +#define _EMIT_WARN_ENTRY #endif #endif /* CONFIG_BUG */ diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h index eb91b2d2935a..26ce2e5c0fa8 100644 --- a/arch/powerpc/include/asm/extable.h +++ b/arch/powerpc/include/asm/extable.h @@ -17,6 +17,8 @@ #define ARCH_HAS_RELATIVE_EXTABLE +#ifndef __ASSEMBLY__ + struct exception_table_entry { int insn; int fixup; @@ -28,3 +30,15 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) } #endif + +/* + * Helper macro for exception table entries + */ +#define EX_TABLE(_fault, _target) \ + stringify_in_c(.section __ex_table,"a";)\ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - . ;) \ + stringify_in_c(.long (_target) - . ;) \ + stringify_in_c(.previous) + +#endif diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 116c1519728a..ffe712307e11 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef __ASSEMBLY__ @@ -752,16 +753,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif /* __ASSEMBLY__ */ -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - stringify_in_c(.section __ex_table,"a";)\ - stringify_in_c(.balign 4;) \ - stringify_in_c(.long (_fault) - . ;) \ - stringify_in_c(.long (_target) - . ;) \ - stringify_in_c(.previous) - #define SOFT_MASK_TABLE(_start, _end) \ stringify_in_c(.section __soft_mask_table,"a";)\ stringify_in_c(.balign 8;) \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15720f8661a1..70cff7b49e17 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -309,7 +309,7 @@ _GLOBAL(enter_rtas) */ lbz r0,PACAIRQSOFTMASK(r13) 1: tdeqi r0,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING + EMIT_WARN_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING #endif /* Hard-disable interrupts */ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 39ab15419592..d8645efff902 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -237,7 +237,7 @@ _GLOBAL(copy_page) addi r3,r3,-4 0: twnei r5, 0 /* WARN if r3 is not cache aligned */ - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING addi r4,r4,-4 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c8f648727d36..51d4f5faf425 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1477,8 +1477,13 @@ static void do_program_check(struct pt_regs *regs) if (!(regs->msr & MSR_PR) && /* not user-mode */ report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) { - regs_add_return_ip(regs, 4); - return; + const struct exception_table_entry *entry; + + entry = search_exception_tables(bugaddr); + if (entry) { + regs_set_return_ip(regs, extable_fixup(entry) + regs->nip - bugaddr); + return; + } } _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); return; diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 270a7df898e2..1209e1786af7 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -931,7 +931,7 @@ static void check_section(const char *modname, struct elf_info *elf, ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ - ".coldtext" + ".coldtext", ".softirqentry.text" #define INIT_SECTIONS ".init.*" #define MEM_INIT_SECTIONS ".meminit.*" diff --git a/tools/testing/selftests/powerpc/primitives/asm/extable.h b/tools/testing/selftests/powerpc/primitives/asm/extable.h new file mode 120000 index 000000000000..6385f059a951 --- /dev/null +++ b/tools/testing/selftests/powerpc/primitives/asm/extable.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/extable.h \ No newline at end of file From 0355785313e2191be4e1108cdbda94ddb0238c48 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 13 Aug 2021 13:05:11 -0700 Subject: [PATCH 168/474] powerpc: Add "-z notext" flag to disable diagnostic Object files used to link .tmp_vmlinux.kallsyms1 have many R_PPC64_ADDR64 relocations in non-SHF_WRITE sections. There are many text relocations (e.g. in .rela___ksymtab_gpl+* and .rela__mcount_loc sections) in a -pie link and are disallowed by LLD: ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment; recompile object files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in the output >>> defined in arch/powerpc/kernel/head_64.o >>> referenced by arch/powerpc/kernel/head_64.o:(__restart_table+0x10) Newer GNU ld configured with "--enable-textrel-check=error" will report an error as well: $ ld-new -EL -m elf64lppc -pie ... -o .tmp_vmlinux.kallsyms1 ... ld-new: read-only segment has dynamic relocations Add "-z notext" to suppress the errors. Non-CONFIG_RELOCATABLE builds use the default -no-pie mode and thus R_PPC64_ADDR64 relocations can be resolved at link-time. Reported-by: Itaru Kitayama Co-developed-by: Bill Wendling Signed-off-by: Fangrui Song Signed-off-by: Bill Wendling Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210813200511.1905703-1-morbo@google.com --- arch/powerpc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 9aaf1abbc641..aa6808e70647 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -122,6 +122,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) ifdef CONFIG_PPC64 From 8b893ef190b0c440877de04f767efca4bf4d6af8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Aug 2021 12:30:11 +1000 Subject: [PATCH 169/474] powerpc/pseries: Fix build error when NUMA=n As reported by lkp, if NUMA=n we see a build error: arch/powerpc/platforms/pseries/hotplug-cpu.c: In function 'pseries_cpu_hotplug_init': arch/powerpc/platforms/pseries/hotplug-cpu.c:1022:8: error: 'node_to_cpumask_map' undeclared 1022 | node_to_cpumask_map[node]); Use cpumask_of_node() which has an empty stub for NUMA=n, and when NUMA=y does a lookup from node_to_cpumask_map[]. Fixes: bd1dd4c5f528 ("powerpc/pseries: Prevent free CPU ids being reused on another node") Reported-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210816041032.2839343-1-mpe@ellerman.id.au --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 1ef40ef699a6..d646c22e94ab 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -1021,7 +1021,7 @@ static int __init pseries_cpu_hotplug_init(void) /* Record ids of CPU added at boot time */ cpumask_or(node_recorded_ids_map[node], node_recorded_ids_map[node], - node_to_cpumask_map[node]); + cpumask_of_node(node)); } of_reconfig_notifier_register(&pseries_smp_nb); From 47c258d71ebfc832a760a1dc6540cf3c33968023 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Aug 2021 15:23:34 -0700 Subject: [PATCH 170/474] powerpc/head_check: use stdout for error messages Prefer stderr instead of stdout for error messages. This is a good practice and can help CI error detecting and reporting (0day in this case). Signed-off-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210815222334.9575-1-rdunlap@infradead.org --- arch/powerpc/tools/head_check.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index e32d3162e5ed..e477837fdc58 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -56,9 +56,9 @@ expected_start_head_addr=$vma start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) if [ "$start_head_addr" != "$expected_start_head_addr" ]; then - echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi @@ -70,9 +70,9 @@ expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) if [ "$start_text_addr" != "$expected_start_text_addr" ]; then - echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi From e95ad5f21693a37c0d318b5988c5a0de324eb3e3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Aug 2021 16:36:02 +1000 Subject: [PATCH 171/474] powerpc/head_check: Fix shellcheck errors Replace "cat file | grep pattern" with "grep pattern file", and quote a few variables. Together that fixes all shellcheck errors. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817125154.3369884-1-mpe@ellerman.id.au --- arch/powerpc/tools/head_check.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index e477837fdc58..689907cda996 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -49,11 +49,11 @@ vmlinux="$2" $nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" > .tmp_symbols.txt -vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) +vma=$(grep -e " [TA] _stext$" .tmp_symbols.txt | cut -d' ' -f1) -expected_start_head_addr=$vma +expected_start_head_addr="$vma" -start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) +start_head_addr=$(grep " t start_first_256B$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_head_addr" != "$expected_start_head_addr" ]; then echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2 @@ -63,11 +63,11 @@ if [ "$start_head_addr" != "$expected_start_head_addr" ]; then exit 1 fi -top_vma=$(echo $vma | cut -d'0' -f1) +top_vma=$(echo "$vma" | cut -d'0' -f1) -expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") +expected_start_text_addr=$(grep " a text_start$" .tmp_symbols.txt | cut -d' ' -f1 | sed "s/^0/$top_vma/") -start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) +start_text_addr=$(grep " t start_text$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_text_addr" != "$expected_start_text_addr" ]; then echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2 From c5ac55b6cbc604ad4144c380feae20f69453df91 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:05 +0930 Subject: [PATCH 172/474] powerpc/config: Fix IPV6 warning in mpc855_ads When building this config there's a warning: 79:warning: override: reassigning to symbol IPV6 Commit 9a1762a4a4ff ("powerpc/8xx: Update mpc885_ads_defconfig to improve CI") added CONFIG_IPV6=y, but left '# CONFIG_IPV6 is not set' in. IPV6 is default y, so remove both to clean up the build. Fixes: 9a1762a4a4ff ("powerpc/8xx: Update mpc885_ads_defconfig to improve CI") Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-2-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index d21f266cea9a..5cd17adf903f 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -21,7 +21,6 @@ CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_PNP=y CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_BLOCK=y @@ -76,7 +75,6 @@ CONFIG_PERF_EVENTS=y CONFIG_MATH_EMULATION=y CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_STRICT_KERNEL_RWX=y -CONFIG_IPV6=y CONFIG_BPF_JIT=y CONFIG_DEBUG_VM_PGTABLE=y CONFIG_BDI_SWITCH=y From d0e28a6145c3455b69991245e7f6147eb914b34a Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:06 +0930 Subject: [PATCH 173/474] powerpc/config: Renable MTD_PHYSMAP_OF CONFIG_MTD_PHYSMAP_OF is not longer enabled as it depends on MTD_PHYSMAP which is not enabled. This is a regression from commit 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c"), which added the extra dependency. Add CONFIG_MTD_PHYSMAP=y so this stays in the config, as Christophe said it is useful for build coverage. Fixes: 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c") Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-3-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 5cd17adf903f..cd08f9ed2c8d 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -33,6 +33,7 @@ CONFIG_MTD_CFI_GEOMETRY=y # CONFIG_MTD_CFI_I2 is not set CONFIG_MTD_CFI_I4=y CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PHYSMAP_OF=y # CONFIG_BLK_DEV is not set CONFIG_NETDEVICES=y From 87e0d46bf68913f4c87dba94aadc00da658a874b Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:07 +0930 Subject: [PATCH 174/474] powerpc/configs: Regenerate mpc885_ads_defconfig Regenerate atop v5.14-rc6 by doing a make savedefconfig. The changes a re-ordering except for the following (which are still set indirectly): - CONFIG_DEBUG_KERNEL=y selected by EXPERT - CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008 which is the default setting Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-4-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 46 +++++++++++------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index cd08f9ed2c8d..c74dc76b1d0d 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -1,19 +1,30 @@ -CONFIG_PPC_8xx=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_JIT=y +CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set # CONFIG_BASE_FULL is not set # CONFIG_FUTEX is not set +CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y +CONFIG_PPC_8xx=y +CONFIG_8xx_GPIO=y +CONFIG_SMC_UCODE_PATCH=y +CONFIG_PIN_TLB=y CONFIG_GEN_RTC=y CONFIG_HZ_100=y +CONFIG_MATH_EMULATION=y +CONFIG_PPC_16K_PAGES=y +CONFIG_ADVANCED_OPTIONS=y # CONFIG_SECCOMP is not set +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -46,38 +57,25 @@ CONFIG_DAVICOM_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y +CONFIG_SPI=y +CONFIG_SPI_FSL_SPI=y # CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_8xxx_WDT=y # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set CONFIG_TMPFS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_DEV_TALITOS=y CONFIG_CRC32_SLICEBY4=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_PPC_16K_PAGES=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_FS=y -CONFIG_PPC_PTDUMP=y -CONFIG_MODULES=y -CONFIG_SPI=y -CONFIG_SPI_FSL_SPI=y -CONFIG_CRYPTO=y -CONFIG_CRYPTO_DEV_TALITOS=y -CONFIG_8xx_GPIO=y -CONFIG_WATCHDOG=y -CONFIG_8xxx_WDT=y -CONFIG_SMC_UCODE_PATCH=y -CONFIG_ADVANCED_OPTIONS=y -CONFIG_PIN_TLB=y -CONFIG_PERF_EVENTS=y -CONFIG_MATH_EMULATION=y -CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_BPF_JIT=y CONFIG_DEBUG_VM_PGTABLE=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_BDI_SWITCH=y CONFIG_PPC_EARLY_DEBUG=y -CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008 +CONFIG_PPC_PTDUMP=y From e225c4d6bc389701f2f63fc246420a1da3465ab5 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 23 Mar 2021 14:29:05 +0800 Subject: [PATCH 175/474] powerpc: Remove duplicate includes interrupt.c: asm/interrupt.h has been included at line 12, so remove the duplicate one at line 10. time.c: linux/sched/clock.h has been included at line 33,so remove the duplicate one at line 56 and move sched/cputime.h under sched including segament. Signed-off-by: Wan Jiabing Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210323062916.295346-1-wanjiabing@vivo.com --- arch/powerpc/kernel/interrupt.c | 1 - arch/powerpc/kernel/time.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 21bbd615ca41..abc2b3dc3f20 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e45ce427bffb..77bae85e2fae 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,8 +53,6 @@ #include #include #include -#include -#include #include #include From 6af0b5570b59ce8dd1608a8e48f59eff3f4bdd04 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 26 Mar 2021 14:48:08 +0800 Subject: [PATCH 176/474] selftests/powerpc: Remove duplicated include from tm-poison.c Remove duplicated include. Reported-by: Hulk Robot Signed-off-by: Zheng Yongjun Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210326064808.3262568-1-zhengyongjun3@huawei.com --- tools/testing/selftests/powerpc/tm/tm-poison.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c index 29e5f26af7b9..27c083a03d1f 100644 --- a/tools/testing/selftests/powerpc/tm/tm-poison.c +++ b/tools/testing/selftests/powerpc/tm/tm-poison.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "tm.h" From 46983fcd67ac5a830d41ebe3755314db67a6dd16 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:15 +0100 Subject: [PATCH 177/474] iommu: Pull IOVA cookie management into the core Now that everyone has converged on iommu-dma for IOMMU_DOMAIN_DMA support, we can abandon the notion of drivers being responsible for the cookie type, and consolidate all the management into the core code. CC: Yong Wu CC: Chunyan Zhang CC: Maxime Ripard Tested-by: Heiko Stuebner Tested-by: Marek Szyprowski Tested-by: Yoshihiro Shimoda Reviewed-by: Jean-Philippe Brucker Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/46a2c0e7419c7d1d931762dc7b6a69fa082d199a.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 7 +++++++ include/linux/iommu.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f2cda9950bd5..b65fcc66ffa4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "iommu: " fmt #include +#include #include #include #include @@ -1946,6 +1947,11 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, /* Assume all sizes by default; the driver may override this later */ domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; + /* Temporarily avoid -EEXIST while drivers still get their own cookies */ + if (type == IOMMU_DOMAIN_DMA && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { + iommu_domain_free(domain); + domain = NULL; + } return domain; } @@ -1957,6 +1963,7 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + iommu_put_dma_cookie(domain); domain->ops->domain_free(domain); } EXPORT_SYMBOL_GPL(iommu_domain_free); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4997c78e2670..141779d76035 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,6 +40,7 @@ struct iommu_domain; struct notifier_block; struct iommu_sva; struct iommu_fault_event; +struct iommu_dma_cookie; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -86,7 +87,7 @@ struct iommu_domain { iommu_fault_handler_t handler; void *handler_token; struct iommu_domain_geometry geometry; - void *iova_cookie; + struct iommu_dma_cookie *iova_cookie; }; enum iommu_cap { From 3f166dae1ab54872d02d910148388c6a17b43f46 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:16 +0100 Subject: [PATCH 178/474] iommu/amd: Drop IOVA cookie management The core code bakes its own cookies now. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/648e74e7422caa6a7db7fb0c36813c7bd2007af8.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 52fe2326042a..0fd98d35d73b 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1918,16 +1918,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) domain->domain.geometry.aperture_end = ~0ULL; domain->domain.geometry.force_aperture = true; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&domain->domain) == -ENOMEM) - goto free_domain; - return &domain->domain; - -free_domain: - protection_domain_free(domain); - - return NULL; } static void amd_iommu_domain_free(struct iommu_domain *dom) @@ -1944,9 +1935,6 @@ static void amd_iommu_domain_free(struct iommu_domain *dom) if (!dom) return; - if (dom->type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&domain->domain); - if (domain->flags & PD_IOMMUV2_MASK) free_gcr3_table(domain); From 229496a0eb088b34849fcc3ab25f0f9e3f90794a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:17 +0100 Subject: [PATCH 179/474] iommu/arm-smmu: Drop IOVA cookie management The core code bakes its own cookies now. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/7ae3680dad9735cc69c3618866666896bd11e031.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 7 ------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 15 ++++----------- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 9 --------- 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 35d54919c583..ee53a841815e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1984,12 +1984,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&smmu_domain->domain)) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); @@ -2021,7 +2015,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - iommu_put_dma_cookie(domain); free_io_pgtable_ops(smmu_domain->pgtbl_ops); /* Free the CD and ASID, if we allocated them */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index ac21170fa208..970d9e4dcd69 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -868,10 +868,10 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) - return NULL; + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { + if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + return NULL; + } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -881,12 +881,6 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (!smmu_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && (using_legacy_binding || - iommu_get_dma_cookie(&smmu_domain->domain))) { - kfree(smmu_domain); - return NULL; - } - mutex_init(&smmu_domain->init_mutex); spin_lock_init(&smmu_domain->cb_lock); @@ -901,7 +895,6 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) * Free the domain resources. We assume that all devices have * already been detached. */ - iommu_put_dma_cookie(domain); arm_smmu_destroy_domain_context(domain); kfree(smmu_domain); } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 021cf8f65ffc..b91874cb6cf3 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -335,12 +334,6 @@ static struct iommu_domain *qcom_iommu_domain_alloc(unsigned type) if (!qcom_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&qcom_domain->domain)) { - kfree(qcom_domain); - return NULL; - } - mutex_init(&qcom_domain->init_mutex); spin_lock_init(&qcom_domain->pgtbl_lock); @@ -351,8 +344,6 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); - iommu_put_dma_cookie(domain); - if (qcom_domain->iommu) { /* * NOTE: unmap can be called after client device is powered From f297e27f831705de50811d933a394969602e8564 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:18 +0100 Subject: [PATCH 180/474] iommu/vt-d: Drop IOVA cookie management The core code bakes its own cookies now. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/e9dbe3b6108f8538e17e0c5f59f8feeb714f51a4.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c12cc955389a..7e168634c433 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1979,10 +1979,6 @@ static void domain_exit(struct dmar_domain *domain) /* Remove associated devices and clear attached or cached domains */ domain_remove_dev_info(domain); - /* destroy iovas */ - if (domain->domain.type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&domain->domain); - if (domain->pgd) { struct page *freelist; @@ -4544,10 +4540,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&dmar_domain->domain)) - return NULL; - domain = &dmar_domain->domain; domain->geometry.aperture_start = 0; domain->geometry.aperture_end = From 4a376d4ac189ea097524eb53b3137c46a0dcc2d4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:19 +0100 Subject: [PATCH 181/474] iommu/exynos: Drop IOVA cookie management The core code bakes its own cookies now. Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/12d88cbf44e57faa4f0512760e7ed3a9cba05ca8.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/exynos-iommu.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index d0fbf1d10e18..939ffa768986 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -21,7 +21,6 @@ #include #include #include -#include typedef u32 sysmmu_iova_t; typedef u32 sysmmu_pte_t; @@ -735,20 +734,16 @@ static struct iommu_domain *exynos_iommu_domain_alloc(unsigned type) /* Check if correct PTE offsets are initialized */ BUG_ON(PG_ENT_SHIFT < 0 || !dma_dev); + if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED) + return NULL; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; - if (type == IOMMU_DOMAIN_DMA) { - if (iommu_get_dma_cookie(&domain->domain) != 0) - goto err_pgtable; - } else if (type != IOMMU_DOMAIN_UNMANAGED) { - goto err_pgtable; - } - domain->pgtable = (sysmmu_pte_t *)__get_free_pages(GFP_KERNEL, 2); if (!domain->pgtable) - goto err_dma_cookie; + goto err_pgtable; domain->lv2entcnt = (short *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); if (!domain->lv2entcnt) @@ -779,9 +774,6 @@ err_lv2ent: free_pages((unsigned long)domain->lv2entcnt, 1); err_counter: free_pages((unsigned long)domain->pgtable, 2); -err_dma_cookie: - if (type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&domain->domain); err_pgtable: kfree(domain); return NULL; @@ -809,9 +801,6 @@ static void exynos_iommu_domain_free(struct iommu_domain *iommu_domain) spin_unlock_irqrestore(&domain->lock, flags); - if (iommu_domain->type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(iommu_domain); - dma_unmap_single(dma_dev, virt_to_phys(domain->pgtable), LV1TABLE_SIZE, DMA_TO_DEVICE); From 5d8941824e4042728c4f64ebde0f7791742ff571 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:20 +0100 Subject: [PATCH 182/474] iommu/ipmmu-vmsa: Drop IOVA cookie management The core code bakes its own cookies now. Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/dc5513293942d81f84edf61b354b236e5ac51dc2.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/ipmmu-vmsa.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 51ea6f00db2f..d38ff29a76e8 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -564,10 +563,13 @@ static irqreturn_t ipmmu_irq(int irq, void *dev) * IOMMU Operations */ -static struct iommu_domain *__ipmmu_domain_alloc(unsigned type) +static struct iommu_domain *ipmmu_domain_alloc(unsigned type) { struct ipmmu_vmsa_domain *domain; + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA) + return NULL; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; @@ -577,27 +579,6 @@ static struct iommu_domain *__ipmmu_domain_alloc(unsigned type) return &domain->io_domain; } -static struct iommu_domain *ipmmu_domain_alloc(unsigned type) -{ - struct iommu_domain *io_domain = NULL; - - switch (type) { - case IOMMU_DOMAIN_UNMANAGED: - io_domain = __ipmmu_domain_alloc(type); - break; - - case IOMMU_DOMAIN_DMA: - io_domain = __ipmmu_domain_alloc(type); - if (io_domain && iommu_get_dma_cookie(io_domain)) { - kfree(io_domain); - io_domain = NULL; - } - break; - } - - return io_domain; -} - static void ipmmu_domain_free(struct iommu_domain *io_domain) { struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain); @@ -606,7 +587,6 @@ static void ipmmu_domain_free(struct iommu_domain *io_domain) * Free the domain resources. We assume that all devices have already * been detached. */ - iommu_put_dma_cookie(io_domain); ipmmu_domain_destroy_context(domain); free_io_pgtable_ops(domain->iop); kfree(domain); From a88a42be04db17d07eaca9a88e894a3c28443262 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:21 +0100 Subject: [PATCH 183/474] iommu/mtk: Drop IOVA cookie management The core code bakes its own cookies now. CC: Yong Wu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/b856648e7ee2b1017e7c7c02e2ddd50eaf72cbf7.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 7 ------- drivers/iommu/mtk_iommu_v1.c | 1 - 2 files changed, 8 deletions(-) diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 6f7c69688ce2..185694eb4456 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -441,17 +440,11 @@ static struct iommu_domain *mtk_iommu_domain_alloc(unsigned type) if (!dom) return NULL; - if (iommu_get_dma_cookie(&dom->domain)) { - kfree(dom); - return NULL; - } - return &dom->domain; } static void mtk_iommu_domain_free(struct iommu_domain *domain) { - iommu_put_dma_cookie(domain); kfree(to_mtk_domain(domain)); } diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 778e66f5f1aa..be22fcf988ce 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include From b811a451519036e54d1af10e695c9b2ff014fcc3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:22 +0100 Subject: [PATCH 184/474] iommu/rockchip: Drop IOVA cookie management The core code bakes its own cookies now. Tested-by: Heiko Stuebner Acked-by: Heiko Stuebner Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/aff51e2da1e431987ae5fdafa62a6a7c4bd042dc.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 9febfb7f3025..5cb260820eda 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -1074,10 +1073,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) if (!rk_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&rk_domain->domain)) - goto err_free_domain; - /* * rk32xx iommus use a 2 level pagetable. * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries. @@ -1085,7 +1080,7 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) */ rk_domain->dt = (u32 *)get_zeroed_page(GFP_KERNEL | GFP_DMA32); if (!rk_domain->dt) - goto err_put_cookie; + goto err_free_domain; rk_domain->dt_dma = dma_map_single(dma_dev, rk_domain->dt, SPAGE_SIZE, DMA_TO_DEVICE); @@ -1106,9 +1101,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type) err_free_dt: free_page((unsigned long)rk_domain->dt); -err_put_cookie: - if (type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&rk_domain->domain); err_free_domain: kfree(rk_domain); @@ -1137,8 +1129,6 @@ static void rk_iommu_domain_free(struct iommu_domain *domain) SPAGE_SIZE, DMA_TO_DEVICE); free_page((unsigned long)rk_domain->dt); - if (domain->type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&rk_domain->domain); kfree(rk_domain); } From 5ad5f667147848037f2e697f065816c47310d4f9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:23 +0100 Subject: [PATCH 185/474] iommu/sprd: Drop IOVA cookie management The core code bakes its own cookies now. Acked-by: Chunyan Zhang Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/4e7fc6e523cb4b63fb13f5be10041eb24c0dcb1e.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/sprd-iommu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 73dfd9946312..27ac818b0354 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -144,11 +143,6 @@ static struct iommu_domain *sprd_iommu_domain_alloc(unsigned int domain_type) if (!dom) return NULL; - if (iommu_get_dma_cookie(&dom->domain)) { - kfree(dom); - return NULL; - } - spin_lock_init(&dom->pgtlock); dom->domain.geometry.aperture_start = 0; @@ -161,7 +155,6 @@ static void sprd_iommu_domain_free(struct iommu_domain *domain) { struct sprd_iommu_domain *dom = to_sprd_domain(domain); - iommu_put_dma_cookie(domain); kfree(dom); } From aa6546423a5678c79e00599d147153230e71abea Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:24 +0100 Subject: [PATCH 186/474] iommu/sun50i: Drop IOVA cookie management The core code bakes its own cookies now. CC: Maxime Ripard Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/147edb0ba59be563df19cec3e63e621aa65b7b68.1628682048.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/sun50i-iommu.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 181bb1c3437c..92997021e188 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -610,14 +609,10 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) if (!sun50i_domain) return NULL; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&sun50i_domain->domain)) - goto err_free_domain; - sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(DT_SIZE)); if (!sun50i_domain->dt) - goto err_put_cookie; + goto err_free_domain; refcount_set(&sun50i_domain->refcnt, 1); @@ -627,10 +622,6 @@ static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type) return &sun50i_domain->domain; -err_put_cookie: - if (type == IOMMU_DOMAIN_DMA) - iommu_put_dma_cookie(&sun50i_domain->domain); - err_free_domain: kfree(sun50i_domain); @@ -644,8 +635,6 @@ static void sun50i_iommu_domain_free(struct iommu_domain *domain) free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE)); sun50i_domain->dt = NULL; - iommu_put_dma_cookie(domain); - kfree(sun50i_domain); } From ca84ed7f724cf9f9620a607756482e297c2d8fea Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:25 +0100 Subject: [PATCH 187/474] iommu/virtio: Drop IOVA cookie management The core code bakes its own cookies now. Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/f05cd2d0a0f414de3180e2536c7656faf1e52418.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/virtio-iommu.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 6abdcab7273b..80930ce04a16 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -598,12 +598,6 @@ static struct iommu_domain *viommu_domain_alloc(unsigned type) spin_lock_init(&vdomain->mappings_lock); vdomain->mappings = RB_ROOT_CACHED; - if (type == IOMMU_DOMAIN_DMA && - iommu_get_dma_cookie(&vdomain->domain)) { - kfree(vdomain); - return NULL; - } - return &vdomain->domain; } @@ -643,8 +637,6 @@ static void viommu_domain_free(struct iommu_domain *domain) { struct viommu_domain *vdomain = to_viommu_domain(domain); - iommu_put_dma_cookie(domain); - /* Free all remaining mappings (size 2^64) */ viommu_del_mappings(vdomain, 0, 0); From 8d971243a9a7d73a410450951a21f298f1394dc7 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:27 +0100 Subject: [PATCH 188/474] iommu/dma: Remove redundant "!dev" checks iommu_dma_init_domain() is now only called from iommu_setup_dma_ops(), which has already assumed dev to be non-NULL. Reviewed-by: John Garry Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/06024523c080364390016550065e3cfe8031367e.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 98ba927aee1a..afaa1f9b5935 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -370,7 +370,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && (!dev || !dev_is_untrusted(dev)) && + if (!cookie->fq_domain && !dev_is_untrusted(dev) && domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, iommu_dma_entry_dtor)) @@ -379,9 +379,6 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, cookie->fq_domain = domain; } - if (!dev) - return 0; - return iova_reserve_iommu_regions(dev, domain); } From 7a7c5badf85806eab75e31ab8d45021f1545b0e3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:28 +0100 Subject: [PATCH 189/474] iommu: Indicate queued flushes via gather data Since iommu_iotlb_gather exists to help drivers optimise flushing for a given unmap request, it is also the logical place to indicate whether the unmap is strict or not, and thus help them further optimise for whether to expect a sync or a flush_all subsequently. As part of that, it also seems fair to make the flush queue code take responsibility for enforcing the really subtle ordering requirement it brings, so that we don't need to worry about forgetting that if new drivers want to add flush queue support, and can consolidate the existing versions. While we're adding to the kerneldoc, also fill in some info for @freelist which was overlooked previously. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/bf5f8e2ad84e48c712ccbf80fa8c610594c7595f.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 1 + drivers/iommu/iova.c | 7 +++++++ include/linux/iommu.h | 8 +++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index afaa1f9b5935..1eacbbdf601c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -481,6 +481,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); iommu_iotlb_gather_init(&iotlb_gather); + iotlb_gather.queued = cookie->fq_domain; unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); WARN_ON(unmapped != size); diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b6cf5f16123b..2ad73fb2e94e 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -637,6 +637,13 @@ void queue_iova(struct iova_domain *iovad, unsigned long flags; unsigned idx; + /* + * Order against the IOMMU driver's pagetable update from unmapping + * @pte, to guarantee that iova_domain_flush() observes that if called + * from a different CPU before we release the lock below. + */ + smp_wmb(); + spin_lock_irqsave(&fq->lock, flags); /* diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 141779d76035..f7679f6684b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -161,16 +161,22 @@ enum iommu_dev_features { * @start: IOVA representing the start of the range to be flushed * @end: IOVA representing the end of the range to be flushed (inclusive) * @pgsize: The interval at which to perform the flush + * @freelist: Removed pages to free after sync + * @queued: Indicates that the flush will be queued * * This structure is intended to be updated by multiple calls to the * ->unmap() function in struct iommu_ops before eventually being passed - * into ->iotlb_sync(). + * into ->iotlb_sync(). Drivers can add pages to @freelist to be freed after + * ->iotlb_sync() or ->iotlb_flush_all() have cleared all cached references to + * them. @queued is set to indicate when ->iotlb_flush_all() will be called + * later instead of ->iotlb_sync(), so drivers may optimise accordingly. */ struct iommu_iotlb_gather { unsigned long start; unsigned long end; size_t pgsize; struct page *freelist; + bool queued; }; /** From a8e5f04458c4e496eada2b029ce96713bb6c388d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:29 +0100 Subject: [PATCH 190/474] iommu/io-pgtable: Remove non-strict quirk IO_PGTABLE_QUIRK_NON_STRICT was never a very comfortable fit, since it's not a quirk of the pagetable format itself. Now that we have a more appropriate way to convey non-strict unmaps, though, this last of the non-quirk quirks can also go, and with the flush queue code also now enforcing its own ordering we can have a lovely cleanup all round. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/155b5c621cd8936472e273a8b07a182f62c6c20d.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 --- drivers/iommu/io-pgtable-arm-v7s.c | 12 ++---------- drivers/iommu/io-pgtable-arm.c | 12 ++---------- include/linux/io-pgtable.h | 5 ----- 5 files changed, 4 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ee53a841815e..69801866090c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2174,9 +2174,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) - pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; - pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) return -ENOMEM; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 970d9e4dcd69..a325d4769c17 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -765,9 +765,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, .iommu_dev = smmu->dev, }; - if (!iommu_get_dma_strict(domain)) - pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT; - if (smmu->impl && smmu->impl->init_context) { ret = smmu->impl->init_context(smmu_domain, &pgtbl_cfg, dev); if (ret) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 5db90d7ce2ec..e84478d39705 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -700,14 +700,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, ARM_V7S_BLOCK_SIZE(lvl + 1)); ptep = iopte_deref(pte[i], lvl, data); __arm_v7s_free_table(ptep, lvl + 1, data); - } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { - /* - * Order the PTE update against queueing the IOVA, to - * guarantee that a flush callback from a different CPU - * has observed it before the TLBIALL can be issued. - */ - smp_wmb(); - } else { + } else if (!gather->queued) { io_pgtable_tlb_add_page(iop, gather, iova, blk_size); } iova += blk_size; @@ -791,8 +784,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_PERMS | - IO_PGTABLE_QUIRK_ARM_MTK_EXT | - IO_PGTABLE_QUIRK_NON_STRICT)) + IO_PGTABLE_QUIRK_ARM_MTK_EXT)) return NULL; /* If ARM_MTK_4GB is enabled, the NO_PERMS is also expected. */ diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 053df4048a29..48a5bd8f571d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -638,14 +638,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) { - /* - * Order the PTE update against queueing the IOVA, to - * guarantee that a flush callback from a different CPU - * has observed it before the TLBIALL can be issued. - */ - smp_wmb(); - } else { + } else if (!gather->queued) { io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } @@ -825,7 +818,6 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) bool tg1; if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | - IO_PGTABLE_QUIRK_NON_STRICT | IO_PGTABLE_QUIRK_ARM_TTBR1 | IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) return NULL; @@ -929,7 +921,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr; /* The NS quirk doesn't apply at stage 2 */ - if (cfg->quirks & ~(IO_PGTABLE_QUIRK_NON_STRICT)) + if (cfg->quirks) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index c43f3b899d2a..9ba6d9ea316e 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -73,10 +73,6 @@ struct io_pgtable_cfg { * to support up to 35 bits PA where the bit32, bit33 and bit34 are * encoded in the bit9, bit4 and bit5 of the PTE respectively. * - * IO_PGTABLE_QUIRK_NON_STRICT: Skip issuing synchronous leaf TLBIs - * on unmap, for DMA domains using the flush queue mechanism for - * delayed invalidation. - * * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table * for use in the upper half of a split address space. * @@ -86,7 +82,6 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) #define IO_PGTABLE_QUIRK_ARM_MTK_EXT BIT(3) - #define IO_PGTABLE_QUIRK_NON_STRICT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) unsigned long quirks; From bf3aed4660c6e3c44c69f07d8927ee5a22a952ac Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:30 +0100 Subject: [PATCH 191/474] iommu: Introduce explicit type for non-strict DMA domains Promote the difference between strict and non-strict DMA domains from an internal detail to a distinct domain feature and type, to pave the road for exposing it through the sysfs default domain interface. Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/08cd2afaf6b63c58ad49acec3517c9b32c2bb946.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 2 +- drivers/iommu/iommu.c | 8 ++++++-- include/linux/iommu.h | 11 +++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 1eacbbdf601c..17ac3dd4f23e 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1319,7 +1319,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit) * The IOMMU core code allocates the default DMA domain, which the * underlying IOMMU driver needs to support via the dma-iommu layer. */ - if (domain->type == IOMMU_DOMAIN_DMA) { + if (iommu_is_dma_domain(domain)) { if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev)) goto out_err; dev->dma_ops = &iommu_dma_ops; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b65fcc66ffa4..17d6728f5a09 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -115,6 +115,7 @@ static const char *iommu_domain_type_str(unsigned int t) case IOMMU_DOMAIN_UNMANAGED: return "Unmanaged"; case IOMMU_DOMAIN_DMA: + case IOMMU_DOMAIN_DMA_FQ: return "Translated"; default: return "Unknown"; @@ -552,6 +553,9 @@ static ssize_t iommu_group_show_type(struct iommu_group *group, case IOMMU_DOMAIN_DMA: type = "DMA\n"; break; + case IOMMU_DOMAIN_DMA_FQ: + type = "DMA-FQ\n"; + break; } } mutex_unlock(&group->mutex); @@ -765,7 +769,7 @@ static int iommu_create_device_direct_mappings(struct iommu_group *group, unsigned long pg_size; int ret = 0; - if (!domain || domain->type != IOMMU_DOMAIN_DMA) + if (!domain || !iommu_is_dma_domain(domain)) return 0; BUG_ON(!domain->pgsize_bitmap); @@ -1948,7 +1952,7 @@ static struct iommu_domain *__iommu_domain_alloc(struct bus_type *bus, domain->pgsize_bitmap = bus->iommu_ops->pgsize_bitmap; /* Temporarily avoid -EEXIST while drivers still get their own cookies */ - if (type == IOMMU_DOMAIN_DMA && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { + if (iommu_is_dma_domain(domain) && !domain->iova_cookie && iommu_get_dma_cookie(domain)) { iommu_domain_free(domain); domain = NULL; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f7679f6684b1..5629ae42951f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -61,6 +61,7 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API implementation */ #define __IOMMU_DOMAIN_PT (1U << 2) /* Domain is identity mapped */ +#define __IOMMU_DOMAIN_DMA_FQ (1U << 3) /* DMA-API uses flush queue */ /* * This are the possible domain-types @@ -73,12 +74,17 @@ struct iommu_domain_geometry { * IOMMU_DOMAIN_DMA - Internally used for DMA-API implementations. * This flag allows IOMMU drivers to implement * certain optimizations for these domains + * IOMMU_DOMAIN_DMA_FQ - As above, but definitely using batched TLB + * invalidation. */ #define IOMMU_DOMAIN_BLOCKED (0U) #define IOMMU_DOMAIN_IDENTITY (__IOMMU_DOMAIN_PT) #define IOMMU_DOMAIN_UNMANAGED (__IOMMU_DOMAIN_PAGING) #define IOMMU_DOMAIN_DMA (__IOMMU_DOMAIN_PAGING | \ __IOMMU_DOMAIN_DMA_API) +#define IOMMU_DOMAIN_DMA_FQ (__IOMMU_DOMAIN_PAGING | \ + __IOMMU_DOMAIN_DMA_API | \ + __IOMMU_DOMAIN_DMA_FQ) struct iommu_domain { unsigned type; @@ -90,6 +96,11 @@ struct iommu_domain { struct iommu_dma_cookie *iova_cookie; }; +static inline bool iommu_is_dma_domain(struct iommu_domain *domain) +{ + return domain->type & __IOMMU_DOMAIN_DMA_API; +} + enum iommu_cap { IOMMU_CAP_CACHE_COHERENCY, /* IOMMU can enforce cache coherent DMA transactions */ From 6d596039392bac2a0160fb71300d314943411e2a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:31 +0100 Subject: [PATCH 192/474] iommu/amd: Prepare for multiple DMA domain types The DMA ops reset/setup can simply be unconditional, since iommu-dma already knows only to touch DMA domains. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/6450b4f39a5a086d505297b4a53ff1e4a7a0fe7c.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 0fd98d35d73b..02f9b4fffe90 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1707,14 +1707,9 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) static void amd_iommu_probe_finalize(struct device *dev) { - struct iommu_domain *domain; - /* Domains are initialized for this device - have a look what we ended up with */ - domain = iommu_get_domain_for_dev(dev); - if (domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, 0, U64_MAX); - else - set_dma_ops(dev, NULL); + set_dma_ops(dev, NULL); + iommu_setup_dma_ops(dev, 0, U64_MAX); } static void amd_iommu_release_device(struct device *dev) From f9afa313ad0ebbdcf0f01c0453022be6ef2e9c1c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:32 +0100 Subject: [PATCH 193/474] iommu/arm-smmu: Prepare for multiple DMA domain types In preparation for the strict vs. non-strict decision for DMA domains to be expressed in the domain type, make sure we expose our flush queue awareness by accepting the new domain type. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/8f217ef285bd0bb9456c27ef622d2efdbbca1ad8.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 69801866090c..f29dbe9e90ce 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1972,6 +1972,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA && + type != IOMMU_DOMAIN_DMA_FQ && type != IOMMU_DOMAIN_IDENTITY) return NULL; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index a325d4769c17..1d013b1d7bb2 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -866,7 +866,8 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) struct arm_smmu_domain *smmu_domain; if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_IDENTITY) { - if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) + if (using_legacy_binding || + (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ)) return NULL; } /* From 78ca078459d779319a22a8e2a18fc491da8a229b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:33 +0100 Subject: [PATCH 194/474] iommu/vt-d: Prepare for multiple DMA domain types In preparation for the strict vs. non-strict decision for DMA domains to be expressed in the domain type, make sure we expose our flush queue awareness by accepting the new domain type, and test the specific feature flag where we want to identify DMA domains in general. The DMA ops reset/setup can simply be made unconditional, since iommu-dma already knows only to touch DMA domains. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/31a8ef868d593a2f3826a6a120edee81815375a7.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 7e168634c433..8fc46c9d6b96 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -582,7 +582,7 @@ struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) int iommu_id; /* si_domain and vm domain should not get here. */ - if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA)) + if (WARN_ON(!iommu_is_dma_domain(&domain->domain))) return NULL; for_each_domain_iommu(iommu_id, domain) @@ -1034,7 +1034,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (domain_use_first_level(domain)) { pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - if (domain->domain.type == IOMMU_DOMAIN_DMA) + if (iommu_is_dma_domain(&domain->domain)) pteval |= DMA_FL_PTE_ACCESS; } if (cmpxchg64(&pte->val, 0ULL, pteval)) @@ -2345,7 +2345,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if (domain_use_first_level(domain)) { attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - if (domain->domain.type == IOMMU_DOMAIN_DMA) { + if (iommu_is_dma_domain(&domain->domain)) { attr |= DMA_FL_PTE_ACCESS; if (prot & DMA_PTE_WRITE) attr |= DMA_FL_PTE_DIRTY; @@ -4528,6 +4528,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) switch (type) { case IOMMU_DOMAIN_DMA: + case IOMMU_DOMAIN_DMA_FQ: case IOMMU_DOMAIN_UNMANAGED: dmar_domain = alloc_domain(0); if (!dmar_domain) { @@ -5197,12 +5198,8 @@ static void intel_iommu_release_device(struct device *dev) static void intel_iommu_probe_finalize(struct device *dev) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (domain && domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, 0, U64_MAX); - else - set_dma_ops(dev, NULL); + set_dma_ops(dev, NULL); + iommu_setup_dma_ops(dev, 0, U64_MAX); } static void intel_iommu_get_resv_regions(struct device *device, From c208916fe6c7b84e3ec95cd91853039596eeb2cf Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:34 +0100 Subject: [PATCH 195/474] iommu: Express DMA strictness via the domain type Eliminate the iommu_get_dma_strict() indirection and pipe the information through the domain type from the beginning. Besides the flow simplification this also has several nice side-effects: - Automatically implies strict mode for untrusted devices by virtue of their IOMMU_DOMAIN_DMA override. - Ensures that we only end up using flush queues for drivers which are aware of them and can actually benefit. - Allows us to handle flush queue init failure by falling back to strict mode instead of leaving it to possibly blow up later. Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/47083d69155577f1367877b1594921948c366eb3.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 10 ++++++---- drivers/iommu/iommu.c | 14 +++++--------- include/linux/iommu.h | 1 - 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 17ac3dd4f23e..b7ae855c1e89 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -370,13 +370,15 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); - if (!cookie->fq_domain && !dev_is_untrusted(dev) && - domain->ops->flush_iotlb_all && !iommu_get_dma_strict(domain)) { + /* If the FQ fails we can simply fall back to strict mode */ + if (domain->type == IOMMU_DOMAIN_DMA_FQ && !cookie->fq_domain) { if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - iommu_dma_entry_dtor)) + iommu_dma_entry_dtor)) { pr_warn("iova flush queue initialization failed\n"); - else + domain->type = IOMMU_DOMAIN_DMA; + } else { cookie->fq_domain = domain; + } } return iova_reserve_iommu_regions(dev, domain); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 17d6728f5a09..e09f0d433683 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -136,6 +136,9 @@ static int __init iommu_subsys_init(void) } } + if (!iommu_default_passthrough() && !iommu_dma_strict) + iommu_def_domain_type = IOMMU_DOMAIN_DMA_FQ; + pr_info("Default domain type: %s %s\n", iommu_domain_type_str(iommu_def_domain_type), (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? @@ -355,17 +358,10 @@ early_param("iommu.strict", iommu_dma_setup); void iommu_set_dma_strict(void) { iommu_dma_strict = true; + if (iommu_def_domain_type == IOMMU_DOMAIN_DMA_FQ) + iommu_def_domain_type = IOMMU_DOMAIN_DMA; } -bool iommu_get_dma_strict(struct iommu_domain *domain) -{ - /* only allow lazy flushing for DMA domains */ - if (domain->type == IOMMU_DOMAIN_DMA) - return iommu_dma_strict; - return true; -} -EXPORT_SYMBOL_GPL(iommu_get_dma_strict); - static ssize_t iommu_group_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5629ae42951f..923a8d1c5e39 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -504,7 +504,6 @@ int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); void iommu_set_dma_strict(void); -bool iommu_get_dma_strict(struct iommu_domain *domain); extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); From 26225bea1d84b090b378838a1797daec62d4ad0e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:35 +0100 Subject: [PATCH 196/474] iommu: Expose DMA domain strictness via sysfs The sysfs interface for default domain types exists primarily so users can choose the performance/security tradeoff relevant to their own workload. As such, the choice between the policies for DMA domains fits perfectly as an additional point on that scale - downgrading a particular device from a strict default to non-strict may be enough to let it reach the desired level of performance, while still retaining more peace of mind than with a wide-open identity domain. Now that we've abstracted non-strict mode as a distinct type of DMA domain, allow it to be chosen through the user interface as well. Reviewed-by: Lu Baolu Reviewed-by: John Garry Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/0e08da5ed4069fd3473cfbadda758ca983becdbf.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- Documentation/ABI/testing/sysfs-kernel-iommu_groups | 6 +++++- drivers/iommu/iommu.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups index eae2f1c1e11e..b15af6a5bc08 100644 --- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups +++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups @@ -42,8 +42,12 @@ Description: /sys/kernel/iommu_groups//type shows the type of default ======== ====================================================== DMA All the DMA transactions from the device in this group are translated by the iommu. + DMA-FQ As above, but using batched invalidation to lazily + remove translations after use. This may offer reduced + overhead at the cost of reduced memory protection. identity All the DMA transactions from the device in this group - are not translated by the iommu. + are not translated by the iommu. Maximum performance + but zero protection. auto Change to the type the device was booted with. ======== ====================================================== diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e09f0d433683..f653a70389ef 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3268,6 +3268,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, req_type = IOMMU_DOMAIN_IDENTITY; else if (sysfs_streq(buf, "DMA")) req_type = IOMMU_DOMAIN_DMA; + else if (sysfs_streq(buf, "DMA-FQ")) + req_type = IOMMU_DOMAIN_DMA_FQ; else if (sysfs_streq(buf, "auto")) req_type = 0; else From 7cf8a638678c89e9ff62520c5f90c4919b3b2af7 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:36 +0100 Subject: [PATCH 197/474] iommu: Only log strictness for DMA domains When passthrough is enabled, the default strictness policy becomes irrelevant, since any subsequent runtime override to a DMA domain type now embodies an explicit choice of strictness as well. Save on noise by only logging the default policy when it is meaningfully in effect. Reviewed-by: John Garry Reviewed-by: Lu Baolu Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/9d2bcba880c6d517d0751ed8bd4960853030b4d7.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f653a70389ef..069f7dcc1de0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -144,10 +144,11 @@ static int __init iommu_subsys_init(void) (iommu_cmd_line & IOMMU_CMD_LINE_DMA_API) ? "(set via kernel command line)" : ""); - pr_info("DMA domain TLB invalidation policy: %s mode %s\n", - iommu_dma_strict ? "strict" : "lazy", - (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? - "(set via kernel command line)" : ""); + if (!iommu_default_passthrough()) + pr_info("DMA domain TLB invalidation policy: %s mode %s\n", + iommu_dma_strict ? "strict" : "lazy", + (iommu_cmd_line & IOMMU_CMD_LINE_STRICT) ? + "(set via kernel command line)" : ""); return 0; } From e96763ec42ceb7fc4f1e80b8647bc3ef53b5d286 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:37 +0100 Subject: [PATCH 198/474] iommu: Merge strictness and domain type configs To parallel the sysfs behaviour, merge the new build-time option for DMA domain strictness into the default domain type choice. Suggested-by: Joerg Roedel Reviewed-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: John Garry Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d04af35b9c0f2a1d39605d7a9b451f5e1f0c7736.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- .../admin-guide/kernel-parameters.txt | 8 +- drivers/iommu/Kconfig | 80 +++++++++---------- drivers/iommu/iommu.c | 2 +- 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 90b525cf0ec2..19192b39952a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2045,11 +2045,9 @@ 1 - Strict mode. DMA unmap operations invalidate IOMMU hardware TLBs synchronously. - unset - Use value of CONFIG_IOMMU_DEFAULT_{LAZY,STRICT}. - Note: on x86, the default behaviour depends on the - equivalent driver-specific parameters, but a strict - mode explicitly specified by either method takes - precedence. + unset - Use value of CONFIG_IOMMU_DEFAULT_DMA_{LAZY,STRICT}. + Note: on x86, strict mode specified via one of the + legacy driver-specific options takes precedence. iommu.passthrough= [ARM64, X86] Configure DMA to bypass the IOMMU by default. diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c84da8205be7..6e06f876d75a 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -79,55 +79,55 @@ config IOMMU_DEBUGFS debug/iommu directory, and then populate a subdirectory with entries as required. -config IOMMU_DEFAULT_PASSTHROUGH - bool "IOMMU passthrough by default" - depends on IOMMU_API - help - Enable passthrough by default, removing the need to pass in - iommu.passthrough=on or iommu=pt through command line. If this - is enabled, you can still disable with iommu.passthrough=off - or iommu=nopt depending on the architecture. - - If unsure, say N here. - choice - prompt "IOMMU default DMA IOTLB invalidation mode" - depends on IOMMU_DMA - - default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU) - default IOMMU_DEFAULT_STRICT + prompt "IOMMU default domain type" + depends on IOMMU_API + default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU + default IOMMU_DEFAULT_DMA_STRICT help - This option allows an IOMMU DMA IOTLB invalidation mode to be - chosen at build time, to override the default mode of each ARCH, - removing the need to pass in kernel parameters through command line. - It is still possible to provide common boot params to override this - config. + Choose the type of IOMMU domain used to manage DMA API usage by + device drivers. The options here typically represent different + levels of tradeoff between robustness/security and performance, + depending on the IOMMU driver. Not all IOMMUs support all options. + This choice can be overridden at boot via the command line, and for + some devices also at runtime via sysfs. If unsure, keep the default. -config IOMMU_DEFAULT_STRICT - bool "strict" +config IOMMU_DEFAULT_DMA_STRICT + bool "Translated - Strict" help - For every IOMMU DMA unmap operation, the flush operation of IOTLB and - the free operation of IOVA are guaranteed to be done in the unmap - function. + Trusted devices use translation to restrict their access to only + DMA-mapped pages, with strict TLB invalidation on unmap. Equivalent + to passing "iommu.passthrough=0 iommu.strict=1" on the command line. -config IOMMU_DEFAULT_LAZY - bool "lazy" + Untrusted devices always use this mode, with an additional layer of + bounce-buffering such that they cannot gain access to any unrelated + data within a mapped page. + +config IOMMU_DEFAULT_DMA_LAZY + bool "Translated - Lazy" help - Support lazy mode, where for every IOMMU DMA unmap operation, the - flush operation of IOTLB and the free operation of IOVA are deferred. - They are only guaranteed to be done before the related IOVA will be - reused. + Trusted devices use translation to restrict their access to only + DMA-mapped pages, but with "lazy" batched TLB invalidation. This + mode allows higher performance with some IOMMUs due to reduced TLB + flushing, but at the cost of reduced isolation since devices may be + able to access memory for some time after it has been unmapped. + Equivalent to passing "iommu.passthrough=0 iommu.strict=0" on the + command line. - The isolation provided in this mode is not as secure as STRICT mode, - such that a vulnerable time window may be created between the DMA - unmap and the mappings cached in the IOMMU IOTLB or device TLB - finally being invalidated, where the device could still access the - memory which has already been unmapped by the device driver. - However this mode may provide better performance in high throughput - scenarios, and is still considerably more secure than passthrough - mode or no IOMMU. + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. + +config IOMMU_DEFAULT_PASSTHROUGH + bool "Passthrough" + help + Trusted devices are identity-mapped, giving them unrestricted access + to memory with minimal performance overhead. Equivalent to passing + "iommu.passthrough=1" (historically "iommu=pt") on the command line. + + If this mode is not supported by the IOMMU driver, the effective + runtime default will fall back to IOMMU_DEFAULT_DMA_STRICT. endchoice diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 069f7dcc1de0..0e1f791873fa 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -31,7 +31,7 @@ static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static unsigned int iommu_def_domain_type __read_mostly; -static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT); +static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; struct iommu_group { From 452e69b58c2889e5546edb92d9e66285410f7463 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 11 Aug 2021 13:21:38 +0100 Subject: [PATCH 199/474] iommu: Allow enabling non-strict mode dynamically Allocating and enabling a flush queue is in fact something we can reasonably do while a DMA domain is active, without having to rebuild it from scratch. Thus we can allow a strict -> non-strict transition from sysfs without requiring to unbind the device's driver, which is of particular interest to users who want to make selective relaxations to critical devices like the one serving their root filesystem. Disabling and draining a queue also seems technically possible to achieve without rebuilding the whole domain, but would certainly be more involved. Furthermore there's not such a clear use-case for tightening up security *after* the device may already have done whatever it is that you don't trust it not to do, so we only consider the relaxation case. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/d652966348c78457c38bf18daf369272a4ebc2c9.1628682049.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/dma-iommu.c | 47 ++++++++++++++++++++++++++------------- drivers/iommu/iommu.c | 17 ++++++++++---- drivers/iommu/iova.c | 11 ++++----- include/linux/dma-iommu.h | 6 +++++ 4 files changed, 57 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index b7ae855c1e89..bac7370ead3e 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -317,6 +317,30 @@ static bool dev_is_untrusted(struct device *dev) return dev_is_pci(dev) && to_pci_dev(dev)->untrusted; } +/* sysfs updates are serialised by the mutex of the group owning @domain */ +int iommu_dma_init_fq(struct iommu_domain *domain) +{ + struct iommu_dma_cookie *cookie = domain->iova_cookie; + int ret; + + if (cookie->fq_domain) + return 0; + + ret = init_iova_flush_queue(&cookie->iovad, iommu_dma_flush_iotlb_all, + iommu_dma_entry_dtor); + if (ret) { + pr_warn("iova flush queue initialization failed\n"); + return ret; + } + /* + * Prevent incomplete iovad->fq being observable. Pairs with path from + * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova() + */ + smp_wmb(); + WRITE_ONCE(cookie->fq_domain, domain); + return 0; +} + /** * iommu_dma_init_domain - Initialise a DMA mapping domain * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() @@ -371,15 +395,8 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, init_iova_domain(iovad, 1UL << order, base_pfn); /* If the FQ fails we can simply fall back to strict mode */ - if (domain->type == IOMMU_DOMAIN_DMA_FQ && !cookie->fq_domain) { - if (init_iova_flush_queue(iovad, iommu_dma_flush_iotlb_all, - iommu_dma_entry_dtor)) { - pr_warn("iova flush queue initialization failed\n"); - domain->type = IOMMU_DOMAIN_DMA; - } else { - cookie->fq_domain = domain; - } - } + if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain)) + domain->type = IOMMU_DOMAIN_DMA; return iova_reserve_iommu_regions(dev, domain); } @@ -454,17 +471,17 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, } static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size, struct page *freelist) + dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) { struct iova_domain *iovad = &cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ if (cookie->type == IOMMU_DMA_MSI_COOKIE) cookie->msi_iova -= size; - else if (cookie->fq_domain) /* non-strict mode */ + else if (gather && gather->queued) queue_iova(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad), - (unsigned long)freelist); + (unsigned long)gather->freelist); else free_iova_fast(iovad, iova_pfn(iovad, iova), size >> iova_shift(iovad)); @@ -483,14 +500,14 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, dma_addr -= iova_off; size = iova_align(iovad, size + iova_off); iommu_iotlb_gather_init(&iotlb_gather); - iotlb_gather.queued = cookie->fq_domain; + iotlb_gather.queued = READ_ONCE(cookie->fq_domain); unmapped = iommu_unmap_fast(domain, dma_addr, size, &iotlb_gather); WARN_ON(unmapped != size); - if (!cookie->fq_domain) + if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist); + iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); } static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0e1f791873fa..feb66d937c9c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3204,6 +3204,14 @@ static int iommu_change_dev_def_domain(struct iommu_group *group, goto out; } + /* We can bring up a flush queue without tearing down the domain */ + if (type == IOMMU_DOMAIN_DMA_FQ && prev_dom->type == IOMMU_DOMAIN_DMA) { + ret = iommu_dma_init_fq(prev_dom); + if (!ret) + prev_dom->type = IOMMU_DOMAIN_DMA_FQ; + goto out; + } + /* Sets group->default_domain to the newly allocated domain */ ret = iommu_group_alloc_default_domain(dev->bus, group, type); if (ret) @@ -3244,9 +3252,9 @@ out: } /* - * Changing the default domain through sysfs requires the users to ubind the - * drivers from the devices in the iommu group. Return failure if this doesn't - * meet. + * Changing the default domain through sysfs requires the users to unbind the + * drivers from the devices in the iommu group, except for a DMA -> DMA-FQ + * transition. Return failure if this isn't met. * * We need to consider the race between this and the device release path. * device_lock(dev) is used here to guarantee that the device release path @@ -3322,7 +3330,8 @@ static ssize_t iommu_group_store_type(struct iommu_group *group, /* Check if the device in the group still has a driver bound to it */ device_lock(dev); - if (device_is_bound(dev)) { + if (device_is_bound(dev) && !(req_type == IOMMU_DOMAIN_DMA_FQ && + group->default_domain->type == IOMMU_DOMAIN_DMA)) { pr_err_ratelimited("Device is still bound to driver\n"); ret = -EBUSY; goto out; diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 2ad73fb2e94e..0af42fb93a49 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -121,8 +121,6 @@ int init_iova_flush_queue(struct iova_domain *iovad, spin_lock_init(&fq->lock); } - smp_wmb(); - iovad->fq = queue; timer_setup(&iovad->fq_timer, fq_flush_timeout, 0); @@ -633,17 +631,20 @@ void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, unsigned long data) { - struct iova_fq *fq = raw_cpu_ptr(iovad->fq); + struct iova_fq *fq; unsigned long flags; unsigned idx; /* * Order against the IOMMU driver's pagetable update from unmapping * @pte, to guarantee that iova_domain_flush() observes that if called - * from a different CPU before we release the lock below. + * from a different CPU before we release the lock below. Full barrier + * so it also pairs with iommu_dma_init_fq() to avoid seeing partially + * written fq state here. */ - smp_wmb(); + smp_mb(); + fq = raw_cpu_ptr(iovad->fq); spin_lock_irqsave(&fq->lock, flags); /* diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 758ca4694257..24607dc3c2ac 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ void iommu_put_dma_cookie(struct iommu_domain *domain); /* Setup call for arch DMA mapping code */ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit); +int iommu_dma_init_fq(struct iommu_domain *domain); /* The DMA API isn't _quite_ the whole story, though... */ /* @@ -54,6 +55,11 @@ static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, { } +static inline int iommu_dma_init_fq(struct iommu_domain *domain) +{ + return -EINVAL; +} + static inline int iommu_get_dma_cookie(struct iommu_domain *domain) { return -ENODEV; From 51ed00e71f0130e0f3534b8e7d78facd16829426 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 Aug 2021 08:47:28 +0000 Subject: [PATCH 200/474] powerpc/32: Remove unneccessary calculations in load_up_{fpu/altivec} No need to re-read SPRN_THREAD, we can calculate thread address from current (r2). And remove a reload of value 1 into r4 as r4 is already 1. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c04cce578b97a76a9e69a096698b1d89f721768a.1629276437.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/fpu.S | 3 +-- arch/powerpc/kernel/vector.S | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 6010adcee16e..ba4afe3b5a9c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -91,8 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) isync /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - tovirt(r5, r5) + addi r5,r2,THREAD lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index fc120fac1910..ba03eedfdcd8 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -65,9 +65,8 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + addi r5,r2,THREAD oris r9,r9,MSR_VEC@h - tovirt(r5, r5) #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ @@ -81,7 +80,6 @@ _GLOBAL(load_up_altivec) li r4,1 stb r4,THREAD_LOAD_VEC(r5) addi r6,r5,THREAD_VRSTATE - li r4,1 li r10,VRSTATE_VSCR stw r4,THREAD_USED_VR(r5) lvx v0,r10,r6 From 4d99efb229e63928c6b03a756a2e38cd4777fbe8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:44 +0800 Subject: [PATCH 201/474] iommu/vt-d: Update the virtual command related registers The VT-d spec Revision 3.3 updated the virtual command registers, virtual command opcode B register, virtual command response register and virtual command capability register (Section 10.4.43, 10.4.44, 10.4.45, 10.4.46). This updates the virtual command interface implementation in the Intel IOMMU driver accordingly. Fixes: 24f27d32ab6b7 ("iommu/vt-d: Enlightened PASID allocation") Signed-off-by: Lu Baolu Cc: Ashok Raj Cc: Sanjay Kumar Cc: Kevin Tian Link: https://lore.kernel.org/r/20210713042649.3547403-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.h | 10 +++++----- include/linux/intel-iommu.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 5ff61c3d401f..8c2efb85fb3b 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -28,12 +28,12 @@ #define VCMD_CMD_ALLOC 0x1 #define VCMD_CMD_FREE 0x2 #define VCMD_VRSP_IP 0x1 -#define VCMD_VRSP_SC(e) (((e) >> 1) & 0x3) +#define VCMD_VRSP_SC(e) (((e) & 0xff) >> 1) #define VCMD_VRSP_SC_SUCCESS 0 -#define VCMD_VRSP_SC_NO_PASID_AVAIL 2 -#define VCMD_VRSP_SC_INVALID_PASID 2 -#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 8) & 0xfffff) -#define VCMD_CMD_OPERAND(e) ((e) << 8) +#define VCMD_VRSP_SC_NO_PASID_AVAIL 16 +#define VCMD_VRSP_SC_INVALID_PASID 16 +#define VCMD_VRSP_RESULT_PASID(e) (((e) >> 16) & 0xfffff) +#define VCMD_CMD_OPERAND(e) ((e) << 16) /* * Domain ID reserved for pasid entries programmed for first-level * only and pass-through transfer modes. diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d0fa0b31994d..05a65eb155f7 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -124,9 +124,9 @@ #define DMAR_MTRR_PHYSMASK8_REG 0x208 #define DMAR_MTRR_PHYSBASE9_REG 0x210 #define DMAR_MTRR_PHYSMASK9_REG 0x218 -#define DMAR_VCCAP_REG 0xe00 /* Virtual command capability register */ -#define DMAR_VCMD_REG 0xe10 /* Virtual command register */ -#define DMAR_VCRSP_REG 0xe20 /* Virtual command response register */ +#define DMAR_VCCAP_REG 0xe30 /* Virtual command capability register */ +#define DMAR_VCMD_REG 0xe00 /* Virtual command register */ +#define DMAR_VCRSP_REG 0xe10 /* Virtual command response register */ #define DMAR_IQER_REG_IQEI(reg) FIELD_GET(GENMASK_ULL(3, 0), reg) #define DMAR_IQER_REG_ITESID(reg) FIELD_GET(GENMASK_ULL(47, 32), reg) From 5e41c998949377c80d03610600228022430daf45 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 18 Aug 2021 21:48:45 +0800 Subject: [PATCH 202/474] iommu/vt-d: Remove unnecessary oom message Fixes scripts/checkpatch.pl warning: WARNING: Possible unnecessary 'out of memory' message Remove it can help us save a bit of memory. Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210609124937.14260-1-thunder.leizhen@huawei.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/dmar.c | 2 -- drivers/iommu/intel/iommu.c | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index d66f79acd14d..0ec5514c9980 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -149,8 +149,6 @@ dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event) } else { info = kzalloc(size, GFP_KERNEL); if (!info) { - pr_warn("Out of memory when allocating notify_info " - "for %s.\n", pci_name(dev)); if (dmar_dev_scope_status == 0) dmar_dev_scope_status = -ENOMEM; return NULL; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8fc46c9d6b96..36ce79c55766 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1779,11 +1779,8 @@ static int iommu_init_domains(struct intel_iommu *iommu) spin_lock_init(&iommu->lock); iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL); - if (!iommu->domain_ids) { - pr_err("%s: Allocating domain id array failed\n", - iommu->name); + if (!iommu->domain_ids) return -ENOMEM; - } size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); iommu->domains = kzalloc(size, GFP_KERNEL); @@ -3224,7 +3221,6 @@ static int __init init_dmars(void) g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *), GFP_KERNEL); if (!g_iommus) { - pr_err("Allocating global iommu array failed\n"); ret = -ENOMEM; goto error; } From 01dac2d9d2364d2a68b37cc1381947ca53413b51 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:46 +0800 Subject: [PATCH 203/474] iommu/vt-d: Refactor Kconfig a bit Put all sub-options inside a "if INTEL_IOMMU" so that they don't need to always depend on INTEL_IOMMU. Use IS_ENABLED() instead of #ifdef as well. Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/Kconfig | 18 ++++++++++-------- drivers/iommu/intel/iommu.c | 13 ++----------- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 43ebd8af11c5..c1a92c3049d0 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -25,9 +25,11 @@ config INTEL_IOMMU and include PCI device scope covered by these DMA remapping devices. +if INTEL_IOMMU + config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" - depends on INTEL_IOMMU && IOMMU_DEBUGFS + depends on IOMMU_DEBUGFS select DMAR_PERF help !!!WARNING!!! @@ -41,7 +43,7 @@ config INTEL_IOMMU_DEBUGFS config INTEL_IOMMU_SVM bool "Support for Shared Virtual Memory with Intel IOMMU" - depends on INTEL_IOMMU && X86_64 + depends on X86_64 select PCI_PASID select PCI_PRI select MMU_NOTIFIER @@ -53,9 +55,8 @@ config INTEL_IOMMU_SVM means of a Process Address Space ID (PASID). config INTEL_IOMMU_DEFAULT_ON - def_bool y - prompt "Enable Intel DMA Remapping Devices by default" - depends on INTEL_IOMMU + bool "Enable Intel DMA Remapping Devices by default" + default y help Selecting this option will enable a DMAR device at boot time if one is found. If this option is not selected, DMAR support can @@ -63,7 +64,7 @@ config INTEL_IOMMU_DEFAULT_ON config INTEL_IOMMU_BROKEN_GFX_WA bool "Workaround broken graphics drivers (going away soon)" - depends on INTEL_IOMMU && BROKEN && X86 + depends on BROKEN && X86 help Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config @@ -74,7 +75,7 @@ config INTEL_IOMMU_BROKEN_GFX_WA config INTEL_IOMMU_FLOPPY_WA def_bool y - depends on INTEL_IOMMU && X86 + depends on X86 help Floppy disk drivers are known to bypass DMA API calls thereby failing to work when IOMMU is enabled. This @@ -83,7 +84,6 @@ config INTEL_IOMMU_FLOPPY_WA config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON bool "Enable Intel IOMMU scalable mode by default" - depends on INTEL_IOMMU help Selecting this option will enable by default the scalable mode if hardware presents the capability. The scalable mode is defined in @@ -92,3 +92,5 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not selected, scalable mode support could also be enabled by passing intel_iommu=sm_on to the kernel. If not sure, please use the default value. + +endif # INTEL_IOMMU diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 36ce79c55766..acb91ddf32d0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -327,17 +327,8 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); -#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON -int dmar_disabled = 0; -#else -int dmar_disabled = 1; -#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */ - -#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON -int intel_iommu_sm = 1; -#else -int intel_iommu_sm; -#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */ +int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); +int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON); int intel_iommu_enabled = 0; EXPORT_SYMBOL_GPL(intel_iommu_enabled); From 792fb43ce2c98878f1539c1d3fdecc1d7a7d78fd Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:47 +0800 Subject: [PATCH 204/474] iommu/vt-d: Enable Intel IOMMU scalable mode by default The commit 8950dcd83ae7d ("iommu/vt-d: Leave scalable mode default off") leaves the scalable mode default off and end users could turn it on with "intel_iommu=sm_on". Using the Intel IOMMU scalable mode for kernel DMA, user-level device access and Shared Virtual Address have been enabled. This enables the scalable mode by default if the hardware advertises the support and adds kernel options of "intel_iommu=sm_on/sm_off" for end users to configure it through the kernel parameters. Suggested-by: Ashok Raj Suggested-by: Sanjay Kumar Signed-off-by: Lu Baolu Cc: Kevin Tian Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- Documentation/admin-guide/kernel-parameters.txt | 11 ++++++----- drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 19192b39952a..87d46cb76121 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1946,11 +1946,12 @@ By default, super page will be supported if Intel IOMMU has the capability. With this option, super page will not be supported. - sm_on [Default Off] - By default, scalable mode will be disabled even if the - hardware advertises that it has support for the scalable - mode translation. With this option set, scalable mode - will be used on hardware which claims to support it. + sm_on + Enable the Intel IOMMU scalable mode if the hardware + advertises that it has support for the scalable mode + translation. + sm_off + Disallow use of the Intel IOMMU scalable mode. tboot_noforce [Default Off] Do not force the Intel IOMMU enabled under tboot. By default, tboot will force Intel IOMMU on, which diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index c1a92c3049d0..0ddb77115be7 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -84,6 +84,7 @@ config INTEL_IOMMU_FLOPPY_WA config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON bool "Enable Intel IOMMU scalable mode by default" + default y help Selecting this option will enable by default the scalable mode if hardware presents the capability. The scalable mode is defined in diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index acb91ddf32d0..8c9a9ed7dc09 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -432,8 +432,11 @@ static int __init intel_iommu_setup(char *str) pr_info("Disable supported super page\n"); intel_iommu_superpage = 0; } else if (!strncmp(str, "sm_on", 5)) { - pr_info("Intel-IOMMU: scalable mode supported\n"); + pr_info("Enable scalable mode if hardware supports\n"); intel_iommu_sm = 1; + } else if (!strncmp(str, "sm_off", 6)) { + pr_info("Scalable mode is disallowed\n"); + intel_iommu_sm = 0; } else if (!strncmp(str, "tboot_noforce", 13)) { pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); intel_iommu_tboot_noforce = 1; From 289b3b005cb9d9dd6b30297b52c2b4596bc878b2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:48 +0800 Subject: [PATCH 205/474] iommu/vt-d: Preset A/D bits for user space DMA usage We preset the access and dirty bits for IOVA over first level usage only for the kernel DMA (i.e., when domain type is IOMMU_DOMAIN_DMA). We should also preset the FL A/D for user space DMA usage. The idea is that even the user space A/D bit memory write is unnecessary. We should avoid it to minimize the overhead. Suggested-by: Sanjay Kumar Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8c9a9ed7dc09..8d4d49e12c51 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2334,13 +2334,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain_use_first_level(domain)) { - attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US; - - if (iommu_is_dma_domain(&domain->domain)) { - attr |= DMA_FL_PTE_ACCESS; - if (prot & DMA_PTE_WRITE) - attr |= DMA_FL_PTE_DIRTY; - } + attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS; + if (prot & DMA_PTE_WRITE) + attr |= DMA_FL_PTE_DIRTY; } pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr; From 48811c44349ffbb778d3e36b53beb03ad43a979c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 18 Aug 2021 21:48:49 +0800 Subject: [PATCH 206/474] iommu/vt-d: Allow devices to have more than 32 outstanding PRs The minimum per-IOMMU PRQ queue size is one 4K page, this is more entries than the hardcoded limit of 32 in the current VT-d code. Some devices can support up to 512 outstanding PRQs but underutilized by this limit of 32. Although, 32 gives some rough fairness when multiple devices share the same IOMMU PRQ queue, but far from optimal for customized use case. This extends the per-IOMMU PRQ queue size to four 4K pages and let the devices have as many outstanding page requests as they can. Signed-off-by: Jacob Pan Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210720013856.4143880-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20210818134852.1847070-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/intel/svm.c | 4 ---- include/linux/intel-svm.h | 5 +++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8d4d49e12c51..d75f59ae28e6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1541,7 +1542,7 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info) if (info->pri_supported && (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) && - !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) + !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH)) info->pri_enabled = 1; #endif if (info->ats_supported && pci_ats_page_aligned(pdev) && diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 9b0f22bc0514..813438a07b62 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -31,8 +31,6 @@ static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); #define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) -#define PRQ_ORDER 0 - static DEFINE_XARRAY_ALLOC(pasid_private_array); static int pasid_private_add(ioasid_t pasid, void *priv) { @@ -724,8 +722,6 @@ struct page_req_dsc { u64 priv_data[2]; }; -#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) - static bool is_canonical_address(u64 addr) { int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 10fa80eef13a..57cceecbe37f 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -14,6 +14,11 @@ #define SVM_REQ_EXEC (1<<1) #define SVM_REQ_PRIV (1<<0) +/* Page Request Queue depth */ +#define PRQ_ORDER 2 +#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) +#define PRQ_DEPTH ((0x1000 << PRQ_ORDER) >> 5) + /* * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only * for access to kernel addresses. No IOTLB flushes are automatically done From 9ddc348214c775ce5d0861e0363182b18a3dd187 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 18 Aug 2021 21:48:50 +0800 Subject: [PATCH 207/474] iommu/vt-d: Drop the kernel doc annotation Kernel doc validator is unhappy with the following .../perf.c:16: warning: Function parameter or member 'latency_lock' not described in 'DEFINE_SPINLOCK' .../perf.c:16: warning: expecting prototype for perf.c(). Prototype was for DEFINE_SPINLOCK() instead Drop kernel doc annotation since the top comment is not in the required format. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210729163538.40101-1-andriy.shevchenko@linux.intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c index 73b7ec705552..0e8e03252d92 100644 --- a/drivers/iommu/intel/perf.c +++ b/drivers/iommu/intel/perf.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * perf.c - performance monitor * * Copyright (C) 2021 Intel Corporation From 8123b0b86855185357e4b12f29e327ba773fc58d Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 18 Aug 2021 21:48:51 +0800 Subject: [PATCH 208/474] iommu/vt-d: Use pasid_pte_is_present() helper function Use the pasid_pte_is_present() helper for present bit check in the intel_pasid_tear_down_entry(). Signed-off-by: Liu Yi L Link: https://lore.kernel.org/r/20210817042425.1784279-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index c6cf44a6c923..02e10491184b 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -517,7 +517,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, if (WARN_ON(!pte)) return; - if (!(pte->val[0] & PASID_PTE_PRESENT)) + if (!pasid_pte_is_present(pte)) return; did = pasid_get_domain_id(pte); From 423d39d8518c1bba12e0889a92beeddbb1502392 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Wed, 18 Aug 2021 21:48:52 +0800 Subject: [PATCH 209/474] iommu/vt-d: Add present bit check in pasid entry setup helpers The helper functions should not modify the pasid entries which are still in use. Add a check against present bit. Signed-off-by: Liu Yi L Link: https://lore.kernel.org/r/20210817042425.1784279-1-yi.l.liu@intel.com Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20210818134852.1847070-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/pasid.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 02e10491184b..b1d0c2945c9a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -534,6 +534,10 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, devtlb_invalidation_with_pasid(iommu, dev, pasid); } +/* + * This function flushes cache for a newly setup pasid table entry. + * Caller of it should not modify the in-use pasid table entries. + */ static void pasid_flush_caches(struct intel_iommu *iommu, struct pasid_entry *pte, u32 pasid, u16 did) @@ -585,6 +589,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, if (WARN_ON(!pte)) return -EINVAL; + /* Caller must ensure PASID entry is not in use. */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); /* Setup the first level page table pointer: */ @@ -684,6 +692,10 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } + /* Caller must ensure PASID entry is not in use. */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_slptr(pte, pgd_val); @@ -723,6 +735,10 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -ENODEV; } + /* Caller must ensure PASID entry is not in use. */ + if (pasid_pte_is_present(pte)) + return -EBUSY; + pasid_clear_entry(pte); pasid_set_domain_id(pte, did); pasid_set_address_width(pte, iommu->agaw); From 1daf08a066cfe500587affd3fa3be8c13b8ff007 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:09 +0200 Subject: [PATCH 210/474] livepatch: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 3a4beb9395c4..291b857a6e20 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -411,7 +411,7 @@ void klp_try_complete_transition(void) /* * Ditto for the idle "swapper" tasks. */ - get_online_cpus(); + cpus_read_lock(); for_each_possible_cpu(cpu) { task = idle_task(cpu); if (cpu_online(cpu)) { @@ -423,7 +423,7 @@ void klp_try_complete_transition(void) task->patch_state = klp_target_state; } } - put_online_cpus(); + cpus_read_unlock(); if (!complete) { if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) From c26d4c5d4f0df7207da3975458261927f9305465 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 19 Aug 2021 13:39:53 +0200 Subject: [PATCH 211/474] powerpc/kvm: Remove obsolete and unneeded select Commit a278e7ea608b ("powerpc: Fix compile issue with force DAWR") selects the non-existing config PPC_DAWR_FORCE_ENABLE for config KVM_BOOK3S_64_HANDLER. As this commit also introduces a config PPC_DAWR and this config PPC_DAWR is selected with PPC if PPC64, there is no need for any further select in the KVM_BOOK3S_64_HANDLER. Remove an obsolete and unneeded select in config KVM_BOOK3S_64_HANDLER. The issue was identified with ./scripts/checkkconfigsymbols.py. Fixes: a278e7ea608b ("powerpc: Fix compile issue with force DAWR") Signed-off-by: Lukas Bulwahn Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819113954.17515-2-lukas.bulwahn@gmail.com --- arch/powerpc/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index e45644657d49..ff581d70f20c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -38,7 +38,6 @@ config KVM_BOOK3S_32_HANDLER config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER - select PPC_DAWR_FORCE_ENABLE config KVM_BOOK3S_PR_POSSIBLE bool From 6cd717fe9b3a787f8e8f9d0bc9b7634a9c104b3e Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 19 Aug 2021 10:46:54 +1000 Subject: [PATCH 212/474] powerpc/tau: Add 'static' storage qualifier to 'tau_work' definition This patch prevents the following sparse warning. arch/powerpc/kernel/tau_6xx.c:199:1: sparse: sparse: symbol 'tau_work' was not declared. Should it be static? Reported-by: kernel test robot Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/44ab381741916a51e783c4a50d0b186abdd8f280.1629334014.git.fthain@linux-m68k.org --- arch/powerpc/kernel/tau_6xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index b9a047d92ec0..8e83d19fe8fa 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -164,7 +164,7 @@ static void tau_work_func(struct work_struct *work) queue_work(tau_workq, work); } -DECLARE_WORK(tau_work, tau_work_func); +static DECLARE_WORK(tau_work, tau_work_func); /* * setup the TAU From f9addd85fbfacf0d155e83dbee8696d6df5ed0c7 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 13 Aug 2021 13:51:58 +0530 Subject: [PATCH 213/474] powerpc/perf/hv-gpci: Fix counter value parsing H_GetPerformanceCounterInfo (0xF080) hcall returns the counter data in the result buffer. Result buffer has specific format defined in the PAPR specification. One of the fields is counter offset and width of the counter data returned. Counter data are returned in a unsigned char array in big endian byte order. To get the final counter data, the values must be left shifted byte at a time. But commit 220a0c609ad17 ("powerpc/perf: Add support for the hv gpci (get performance counter info) interface") made the shifting bitwise and also assumed little endian order. Because of that, hcall counters values are reported incorrectly. In particular this can lead to counters go backwards which messes up the counter prev vs now calculation and leads to huge counter value reporting: #: perf stat -e hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ -C 0 -I 1000 time counts unit events 1.000078854 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 2.000213293 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 3.000320107 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 4.000428392 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 5.000537864 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 6.000649087 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 7.000760312 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 8.000865218 16,448 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 9.000978985 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 10.001088891 16,384 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 11.001201435 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 12.001307937 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ Fix the shifting logic to correct match the format, ie. read bytes in big endian order. Fixes: e4f226b1580b ("powerpc/perf/hv-gpci: Increase request buffer size") Cc: stable@vger.kernel.org # v4.6+ Reported-by: Nageswara R Sastry Signed-off-by: Kajol Jain Tested-by: Nageswara R Sastry Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210813082158.429023-1-kjain@linux.ibm.com --- arch/powerpc/perf/hv-gpci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index d48413e28c39..c756228a081f 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -175,7 +175,7 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index, */ count = 0; for (i = offset; i < offset + length; i++) - count |= arg->bytes[i] << (i - offset); + count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8); *value = count; out: From 4cb266074aa17e9cafed3a92e9f43b161516569f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 19 Aug 2021 14:56:52 +0200 Subject: [PATCH 214/474] powerpc/pseries/vas: Declare pseries_vas_fault_thread_fn() as static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Fixes: 6d0aaf5e0de0 ("powerpc/pseries/vas: Setup IRQ and fault handling") Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-3-clg@kaod.org --- arch/powerpc/platforms/pseries/vas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index b5c1cf1bc64d..b043e3936d21 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -184,7 +184,7 @@ static int h_get_nx_fault(u32 winid, u64 buffer) * Note: The hypervisor forwards an interrupt for each fault request. * So one fault CRB to process for each H_GET_NX_FAULT hcall. */ -irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) +static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) { struct pseries_vas_window *txwin = data; struct coprocessor_request_block crb; From cb53a93e33e108bfec00a13cd12696e1c0ccc8b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 19 Aug 2021 14:56:53 +0200 Subject: [PATCH 215/474] KVM: PPC: Book3S PR: Declare kvmppc_handle_exit_pr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-4-clg@kaod.org --- arch/powerpc/kvm/book3s.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 740e51def5a5..58391b4b32ed 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -23,7 +23,8 @@ extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); extern int kvmppc_book3s_init_pr(void); -extern void kvmppc_book3s_exit_pr(void); +void kvmppc_book3s_exit_pr(void); +extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val); From b352ddae7b2ccd2cb29f495ca0a7c9b6848b623f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 19 Aug 2021 14:56:54 +0200 Subject: [PATCH 216/474] KVM: PPC: Book3S PR: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-5-clg@kaod.org --- arch/powerpc/kvm/book3s_64_mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 26b8b27a3755..feee40cb2ba1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -196,7 +196,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, hva_t ptegp; u64 pteg[16]; u64 avpn = 0; - u64 v, r; + u64 r; u64 v_val, v_mask; u64 eaddr_mask; int i; @@ -285,7 +285,6 @@ do_second: goto do_second; } - v = be64_to_cpu(pteg[i]); r = be64_to_cpu(pteg[i+1]); pp = (r & HPTE_R_PP) | key; if (r & HPTE_R_PP0) From 53f613134984c293af158920ebadcac045fa0545 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 20 Aug 2021 15:49:49 +0800 Subject: [PATCH 217/474] iommu/arm-smmu: Fix missing unlock on error in arm_smmu_device_group() Add the missing unlock before return from function arm_smmu_device_group() in the error handling case. Fixes: b1a1347912a7 ("iommu/arm-smmu: Fix race condition during iommu_group creation") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210820074949.1946576-1-yangyingliang@huawei.com Signed-off-by: Joerg Roedel --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 67b660b0551d..45b4aedd7ab1 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1488,8 +1488,10 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) mutex_lock(&smmu->stream_map_mutex); for_each_cfg_sme(cfg, fwspec, i, idx) { if (group && smmu->s2crs[idx].group && - group != smmu->s2crs[idx].group) + group != smmu->s2crs[idx].group) { + mutex_unlock(&smmu->stream_map_mutex); return ERR_PTR(-EINVAL); + } group = smmu->s2crs[idx].group; } From 898a1ef06ad4a2a8e3c9490ce62624fcd1a7b1f8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 09:28:19 +0000 Subject: [PATCH 218/474] powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments() Use is_32bit_task() which already handles CONFIG_COMPAT. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ba49cdd574558a0363300c3f6b5b062b397cb071.1629451483.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index ba0f88f3a30d..7ea3c4044186 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,10 +90,9 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long val, mask = -1UL; unsigned int n = 6; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (is_32bit_task()) mask = 0xffffffff; -#endif + while (n--) { if (n == 0) val = regs->orig_gpr3; From 770cec16cdc9d15555e67896dd2214a4fec9a3fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 09:39:14 +0000 Subject: [PATCH 219/474] powerpc/audit: Simplify syscall_get_arch() Make use of is_32bit_task() and CONFIG_CPU_LITTLE_ENDIAN to simplify syscall_get_arch(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4be53b9187a4d8c163968f4d224267e41a7fcc33.1629451479.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 7ea3c4044186..c60ebd04b2ed 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -115,16 +115,11 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { - int arch; - - if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT)) - arch = AUDIT_ARCH_PPC64; + if (is_32bit_task()) + return AUDIT_ARCH_PPC; + else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + return AUDIT_ARCH_PPC64LE; else - arch = AUDIT_ARCH_PPC; - -#ifdef __LITTLE_ENDIAN__ - arch |= __AUDIT_ARCH_LE; -#endif - return arch; + return AUDIT_ARCH_PPC64; } #endif /* _ASM_SYSCALL_H */ From f7403abf5f06f407c50252e003f5fb332325147b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 20 Aug 2021 14:14:42 +0100 Subject: [PATCH 220/474] iommu/io-pgtable: Abstract iommu_iotlb_gather access Previously io-pgtable merely passed the iommu_iotlb_gather pointer through to helpers, but now it has grown its own direct dereference. This turns out to break the build for !IOMMU_API configs where the structure only has a dummy definition. It will probably also crash drivers who don't use the gather mechanism and simply pass in NULL. Wrap this dereference in a suitable helper which can both be stubbed out for !IOMMU_API and encapsulate a NULL check otherwise. Fixes: 7a7c5badf858 ("iommu: Indicate queued flushes via gather data") Reported-by: kernel test robot Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/83672ee76f6405c82845a55c148fa836f56fbbc1.1629465282.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 2 +- drivers/iommu/io-pgtable-arm.c | 2 +- include/linux/iommu.h | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index e84478d39705..bfb6acb651e5 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -700,7 +700,7 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, ARM_V7S_BLOCK_SIZE(lvl + 1)); ptep = iopte_deref(pte[i], lvl, data); __arm_v7s_free_table(ptep, lvl + 1, data); - } else if (!gather->queued) { + } else if (!iommu_iotlb_gather_queued(gather)) { io_pgtable_tlb_add_page(iop, gather, iova, blk_size); } iova += blk_size; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 48a5bd8f571d..9697721f7e3a 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -638,7 +638,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (!gather->queued) { + } else if (!iommu_iotlb_gather_queued(gather)) { io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 923a8d1c5e39..a23779c093c7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -548,6 +548,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, gather->start = start; } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return gather && gather->queued; +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -896,6 +901,11 @@ static inline void iommu_iotlb_gather_add_page(struct iommu_domain *domain, { } +static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) +{ + return false; +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } From 126b39368604dbf155ce6dad8781a0454477022a Mon Sep 17 00:00:00 2001 From: Huilong Deng Date: Tue, 17 Aug 2021 10:53:38 +0800 Subject: [PATCH 221/474] MIPS: Return true/false (not 1/0) from bool functions ./arch/mips/kernel/uprobes.c:261:8-9: WARNING: return of 0/1 in function 'arch_uprobe_skip_sstep' with return type bool ./arch/mips/kernel/uprobes.c:78:10-11: WARNING: return of 0/1 in function 'is_trap_insn' with return type bool ./arch/mips/kvm/mmu.c:489:9-10: WARNING: return of 0/1 in function 'kvm_test_age_gfn' with return type bool ./arch/mips/kvm/mmu.c:445:8-9: WARNING: return of 0/1 in function 'kvm_unmap_gfn_range' with return type bool Signed-off-by: Huilong Deng Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/uprobes.c | 10 +++++----- arch/mips/kvm/mmu.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c index 6dbe4eab0a0e..9db2a6db5f62 100644 --- a/arch/mips/kernel/uprobes.c +++ b/arch/mips/kernel/uprobes.c @@ -75,7 +75,7 @@ bool is_trap_insn(uprobe_opcode_t *insn) case tlt_op: case tltu_op: case tne_op: - return 1; + return true; } break; @@ -87,12 +87,12 @@ bool is_trap_insn(uprobe_opcode_t *insn) case tlti_op: case tltiu_op: case tnei_op: - return 1; + return true; } break; } - return 0; + return false; } #define UPROBE_TRAP_NR ULONG_MAX @@ -254,9 +254,9 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) * See if the instruction can be emulated. * Returns true if instruction was emulated, false otherwise. * - * For now we always emulate so this function just returns 0. + * For now we always emulate so this function just returns false. */ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - return 0; + return false; } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6d1f68cf4edf..1bfd1b501d82 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -442,7 +442,7 @@ static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mips_flush_gpa_pt(kvm, range->start, range->end); - return 1; + return true; } bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -486,7 +486,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); if (!gpa_pte) - return 0; + return false; return pte_young(*gpa_pte); } From cd92dbaf5d0444c403ca818ec37d945f05e9d240 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 16 Aug 2021 12:53:26 +0200 Subject: [PATCH 222/474] MAINTAINERS: adjust PISTACHIO SOC SUPPORT after its retirement Commit 104f942b2832 ("MIPS: Retire MACH_PISTACHIO") removes ./arch/mips/pistachio/ and ./arch/mips/configs/pistachio_defconfig, but misses to adjust the corresponding section PISTACHIO SOC SUPPORT in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matches F: arch/mips/configs/pistachio*_defconfig warning: no file matches F: arch/mips/pistachio/ As James Hartley is not reachable with the provided email address, the remaining dtsi file, arch/mips/boot/dts/img/pistachio.dtsi, must be maintained by its only user pistachio_marduk.dts, which is part of MARDUK (CREATOR CI40) DEVICE TREE SUPPORT. Add maintenance of pistachio.dtsi to that section and drop the PISTACHIO SOC SUPPORT after its retirement. Signed-off-by: Lukas Bulwahn Acked-by: Rahul Bedarkar Acked-by: Jiaxun Yang Signed-off-by: Thomas Bogendoerfer --- MAINTAINERS | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a61f4f3b78a9..6723c68bd0f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11039,7 +11039,7 @@ MARDUK (CREATOR CI40) DEVICE TREE SUPPORT M: Rahul Bedarkar L: linux-mips@vger.kernel.org S: Maintained -F: arch/mips/boot/dts/img/pistachio_marduk.dts +F: arch/mips/boot/dts/img/pistachio* MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER M: Andrew Lunn @@ -14694,14 +14694,6 @@ S: Maintained W: http://www.st.com/spear F: drivers/pinctrl/spear/ -PISTACHIO SOC SUPPORT -M: James Hartley -L: linux-mips@vger.kernel.org -S: Odd Fixes -F: arch/mips/boot/dts/img/pistachio* -F: arch/mips/configs/pistachio*_defconfig -F: arch/mips/pistachio/ - PKTCDVD DRIVER M: linux-block@vger.kernel.org S: Orphan From 0181f6f19c6c35b24f1516d8db22f3bbce762633 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Aug 2021 20:04:15 +0300 Subject: [PATCH 223/474] MIPS: mscc: ocelot: disable all switch ports by default The ocelot switch driver used to ignore ports which do not have a phy-handle property and not probe those, but this is not quite ok since it is valid to not have a phy-handle property if there is a fixed-link. It seems that checking for a phy-handle was a proxy for the proper check which is for the status, but that doesn't make a lot of sense, since the ocelot driver already iterates using for_each_available_child_of_node which skips the disabled ports, so I have no idea. Anyway, a widespread pattern in device trees is for a SoC dtsi to disable by default all hardware, and let board dts files enable what is used. So let's do that and enable only the ports with a phy-handle in the pcb120 and pcb123 device tree files. Signed-off-by: Vladimir Oltean Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/ocelot.dtsi | 11 +++++++++++ arch/mips/boot/dts/mscc/ocelot_pcb120.dts | 8 ++++++++ arch/mips/boot/dts/mscc/ocelot_pcb123.dts | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/arch/mips/boot/dts/mscc/ocelot.dtsi b/arch/mips/boot/dts/mscc/ocelot.dtsi index 535a98284dcb..e51db651af13 100644 --- a/arch/mips/boot/dts/mscc/ocelot.dtsi +++ b/arch/mips/boot/dts/mscc/ocelot.dtsi @@ -150,36 +150,47 @@ port0: port@0 { reg = <0>; + status = "disabled"; }; port1: port@1 { reg = <1>; + status = "disabled"; }; port2: port@2 { reg = <2>; + status = "disabled"; }; port3: port@3 { reg = <3>; + status = "disabled"; }; port4: port@4 { reg = <4>; + status = "disabled"; }; port5: port@5 { reg = <5>; + status = "disabled"; }; port6: port@6 { reg = <6>; + status = "disabled"; }; port7: port@7 { reg = <7>; + status = "disabled"; }; port8: port@8 { reg = <8>; + status = "disabled"; }; port9: port@9 { reg = <9>; + status = "disabled"; }; port10: port@10 { reg = <10>; + status = "disabled"; }; }; }; diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts index 897de5025d7f..d2dc6b3d923c 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts @@ -69,40 +69,48 @@ }; &port0 { + status = "okay"; phy-handle = <&phy0>; }; &port1 { + status = "okay"; phy-handle = <&phy1>; }; &port2 { + status = "okay"; phy-handle = <&phy2>; }; &port3 { + status = "okay"; phy-handle = <&phy3>; }; &port4 { + status = "okay"; phy-handle = <&phy7>; phy-mode = "sgmii"; phys = <&serdes 4 SERDES1G(2)>; }; &port5 { + status = "okay"; phy-handle = <&phy4>; phy-mode = "sgmii"; phys = <&serdes 5 SERDES1G(5)>; }; &port6 { + status = "okay"; phy-handle = <&phy6>; phy-mode = "sgmii"; phys = <&serdes 6 SERDES1G(3)>; }; &port9 { + status = "okay"; phy-handle = <&phy5>; phy-mode = "sgmii"; phys = <&serdes 9 SERDES1G(4)>; diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts index ef852f382da8..7d7e638791dd 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts @@ -47,17 +47,21 @@ }; &port0 { + status = "okay"; phy-handle = <&phy0>; }; &port1 { + status = "okay"; phy-handle = <&phy1>; }; &port2 { + status = "okay"; phy-handle = <&phy2>; }; &port3 { + status = "okay"; phy-handle = <&phy3>; }; From eba54cbb92d28b4f6dc1ed5f73f5187b09d82c08 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Aug 2021 20:04:16 +0300 Subject: [PATCH 224/474] MIPS: mscc: ocelot: mark the phy-mode for internal PHY ports The ocelot driver was converted to phylink, and that expects a valid phy_interface_t. Without a phy-mode, of_get_phy_mode returns PHY_INTERFACE_MODE_NA, which is not ideal because phylink rejects that. The ocelot driver was patched to treat PHY_INTERFACE_MODE_NA as PHY_INTERFACE_MODE_INTERNAL to work with the broken DT blobs, but we should fix the device trees and specify the phy-mode too. Signed-off-by: Vladimir Oltean Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/ocelot_pcb120.dts | 4 ++++ arch/mips/boot/dts/mscc/ocelot_pcb123.dts | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts index d2dc6b3d923c..bd240690cb37 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb120.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb120.dts @@ -71,21 +71,25 @@ &port0 { status = "okay"; phy-handle = <&phy0>; + phy-mode = "internal"; }; &port1 { status = "okay"; phy-handle = <&phy1>; + phy-mode = "internal"; }; &port2 { status = "okay"; phy-handle = <&phy2>; + phy-mode = "internal"; }; &port3 { status = "okay"; phy-handle = <&phy3>; + phy-mode = "internal"; }; &port4 { diff --git a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts index 7d7e638791dd..0185045c7630 100644 --- a/arch/mips/boot/dts/mscc/ocelot_pcb123.dts +++ b/arch/mips/boot/dts/mscc/ocelot_pcb123.dts @@ -49,19 +49,23 @@ &port0 { status = "okay"; phy-handle = <&phy0>; + phy-mode = "internal"; }; &port1 { status = "okay"; phy-handle = <&phy1>; + phy-mode = "internal"; }; &port2 { status = "okay"; phy-handle = <&phy2>; + phy-mode = "internal"; }; &port3 { status = "okay"; phy-handle = <&phy3>; + phy-mode = "internal"; }; From a00ea5b6f2bbef8b004b0b7228c61680a50c7c3f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 06:45:20 +0000 Subject: [PATCH 225/474] powerpc/syscalls: Remove __NR__exit __NR__exit is nowhere used. On most architectures it was removed by commit 135ab6ec8fda ("[PATCH] remove remaining errno and __KERNEL_SYSCALLS__ references") but not on powerpc. powerpc removed __KERNEL_SYSCALLS__ in commit 3db03b4afb3e ("[PATCH] rename the provided execve functions to kernel_execve"), but __NR__exit was left over. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6457eb4f327313323ed1f70e540bbb4ddc9178fa.1629701106.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/unistd.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b541c690a31c..5eb462af6766 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -9,8 +9,6 @@ #define NR_syscalls __NR_syscalls -#define __NR__exit __NR_exit - #ifndef __ASSEMBLY__ #include From 3accc0faef081b6813967b34f7d05a3edb855cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 23 Aug 2021 11:00:38 +0200 Subject: [PATCH 226/474] =?UTF-8?q?powerpc/prom:=20Fix=20unused=20variable?= =?UTF-8?q?=20=E2=80=98reserve=5Fmap=E2=80=99=20when=20CONFIG=5FPPC32=20is?= =?UTF-8?q?=20not=20set?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’: arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not used [-Werror=unused-but-set-variable] __be64 *reserve_map; ^~~~~~~~~~~ cc1: all warnings being treated as errors Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210823090039.166120-2-clg@kaod.org --- arch/powerpc/kernel/prom.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f620e04dc9bf..44b2cdc0aae3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -640,7 +640,9 @@ static void __init early_reserve_mem(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_PPC32 + if (!IS_ENABLED(CONFIG_PPC32)) + return; + /* * Handle the case where we might be booting from an old kexec * image that setup the mem_rsvmap as pairs of 32-bit values @@ -661,7 +663,6 @@ static void __init early_reserve_mem(void) } return; } -#endif } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM From cc47ad409ba9cc950e9c492c8ba653dabd392148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 23 Aug 2021 11:00:39 +0200 Subject: [PATCH 227/474] powerpc/compat_sys: Declare syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210823090039.166120-3-clg@kaod.org --- arch/powerpc/include/asm/syscalls.h | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 398171fdcd9f..7ee66ae5444d 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -6,6 +6,7 @@ #include #include #include +#include struct rtas_args; @@ -18,5 +19,34 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len, asmlinkage long ppc64_personality(unsigned long personality); asmlinkage long sys_rtas(struct rtas_args __user *uargs); +#ifdef CONFIG_COMPAT +unsigned long compat_sys_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); + +compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count); + +int compat_sys_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); + +long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2); + +int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, + unsigned long len2); + +long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); + +long compat_sys_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, unsigned int offset2, + unsigned int nbytes1, unsigned int nbytes2); +#endif + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_SYSCALLS_H */ From ce5cb67c664fbc93d1af20b3fbd7a07eda9f6ee6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 16 Aug 2021 14:26:16 +0100 Subject: [PATCH 228/474] of: Move of_dma_set_restricted_buffer() into device.c Rob observes that: | of_dma_set_restricted_buffer() [...] should also be moved to | of/device.c. There's no reason for it to be in of/address.c. It has | nothing to do with address parsing. Move it to of/device.c, as he suggests. Cc: Claire Chang Cc: Konrad Rzeszutek Wilk Cc: Christoph Hellwig Cc: Robin Murphy Suggested-by: Rob Herring Link: https://lore.kernel.org/r/CAL_JsqJ7ROWWJX84x2kEex9NQ8G+2=ybRuNOobX+j8bjZzSemQ@mail.gmail.com Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/of/address.c | 33 --------------------------------- drivers/of/device.c | 34 ++++++++++++++++++++++++++++++++++ drivers/of/of_private.h | 7 ------- 3 files changed, 34 insertions(+), 40 deletions(-) diff --git a/drivers/of/address.c b/drivers/of/address.c index 973257434398..94f017d808c4 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -996,38 +995,6 @@ out: of_node_put(node); return ret; } - -int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) -{ - struct device_node *node, *of_node = dev->of_node; - int count, i; - - count = of_property_count_elems_of_size(of_node, "memory-region", - sizeof(u32)); - /* - * If dev->of_node doesn't exist or doesn't contain memory-region, try - * the OF node having DMA configuration. - */ - if (count <= 0) { - of_node = np; - count = of_property_count_elems_of_size( - of_node, "memory-region", sizeof(u32)); - } - - for (i = 0; i < count; i++) { - node = of_parse_phandle(of_node, "memory-region", i); - /* - * There might be multiple memory regions, but only one - * restricted-dma-pool region is allowed. - */ - if (of_device_is_compatible(node, "restricted-dma-pool") && - of_device_is_available(node)) - return of_reserved_mem_device_init_by_idx(dev, of_node, - i); - } - - return 0; -} #endif /* CONFIG_HAS_DMA */ /** diff --git a/drivers/of/device.c b/drivers/of/device.c index 2defdca418ec..089c5b4b97fb 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -5,6 +5,7 @@ #include #include #include +#include #include /* for bus_dma_region */ #include #include @@ -52,6 +53,39 @@ int of_device_add(struct platform_device *ofdev) return device_add(&ofdev->dev); } +static int +of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) +{ + struct device_node *node, *of_node = dev->of_node; + int count, i; + + count = of_property_count_elems_of_size(of_node, "memory-region", + sizeof(u32)); + /* + * If dev->of_node doesn't exist or doesn't contain memory-region, try + * the OF node having DMA configuration. + */ + if (count <= 0) { + of_node = np; + count = of_property_count_elems_of_size( + of_node, "memory-region", sizeof(u32)); + } + + for (i = 0; i < count; i++) { + node = of_parse_phandle(of_node, "memory-region", i); + /* + * There might be multiple memory regions, but only one + * restricted-dma-pool region is allowed. + */ + if (of_device_is_compatible(node, "restricted-dma-pool") && + of_device_is_available(node)) + return of_reserved_mem_device_init_by_idx(dev, of_node, + i); + } + + return 0; +} + /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index f557bd22b0cf..631489f7f8c0 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -163,19 +163,12 @@ struct bus_dma_region; #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA) int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map); -int of_dma_set_restricted_buffer(struct device *dev, struct device_node *np); #else static inline int of_dma_get_range(struct device_node *np, const struct bus_dma_region **map) { return -ENODEV; } -static inline int of_dma_set_restricted_buffer(struct device *dev, - struct device_node *np) -{ - /* Do nothing, successfully. */ - return 0; -} #endif void fdt_init_reserved_mem(void); From f3cfd136aef0184919464b49d5b74d43605abcbc Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 16 Aug 2021 14:26:17 +0100 Subject: [PATCH 229/474] of: restricted dma: Don't fail device probe on rmem init failure If CONFIG_DMA_RESTRICTED_POOL=n then probing a device with a reference to a "restricted-dma-pool" will fail with a reasonably cryptic error: | pci-host-generic: probe of 10000.pci failed with error -22 Rework of_dma_set_restricted_buffer() so that it does not cause probing failure and instead either returns early if CONFIG_DMA_RESTRICTED_POOL=n or emits a diagnostic if the reserved DMA pool fails to initialise. Cc: Claire Chang Cc: Konrad Rzeszutek Wilk Cc: Christoph Hellwig Cc: Rob Herring Cc: Robin Murphy Signed-off-by: Will Deacon Signed-off-by: Konrad Rzeszutek Wilk --- drivers/of/device.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/of/device.c b/drivers/of/device.c index 089c5b4b97fb..5b043ee30824 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -53,12 +53,15 @@ int of_device_add(struct platform_device *ofdev) return device_add(&ofdev->dev); } -static int +static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *node, *of_node = dev->of_node; int count, i; + if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) + return; + count = of_property_count_elems_of_size(of_node, "memory-region", sizeof(u32)); /* @@ -79,11 +82,11 @@ of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) */ if (of_device_is_compatible(node, "restricted-dma-pool") && of_device_is_available(node)) - return of_reserved_mem_device_init_by_idx(dev, of_node, - i); + break; } - return 0; + if (i != count && of_reserved_mem_device_init_by_idx(dev, of_node, i)) + dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); } /** @@ -200,7 +203,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np, arch_setup_dma_ops(dev, dma_start, size, iommu, coherent); if (!iommu) - return of_dma_set_restricted_buffer(dev, np); + of_dma_set_restricted_buffer(dev, np); return 0; } From 5d7d6dac8fe99ed59eee2300e4a03370f94d5222 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:14 -0300 Subject: [PATCH 230/474] KVM: PPC: Book3S HV: Fix copy_tofrom_guest routines The __kvmhv_copy_tofrom_guest_radix function was introduced along with nested HV guest support. It uses the platform's Radix MMU quadrants to provide a nested hypervisor with fast access to its nested guests memory (H_COPY_TOFROM_GUEST hypercall). It has also since been added as a fast path for the kvmppc_ld/st routines which are used during instruction emulation. The commit def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") changed the low level copy function from raw_copy_from_user to probe_user_read, which adds a check to access_ok. In powerpc that is: static inline bool __access_ok(unsigned long addr, unsigned long size) { return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; } and TASK_SIZE_MAX is 0x0010000000000000UL for 64-bit, which means that setting the two MSBs of the effective address (which correspond to the quadrant) now cause access_ok to reject the access. This was not caught earlier because the most common code path via kvmppc_ld/st contains a fallback (kvm_read_guest) that is likely to succeed for L1 guests. For nested guests there is no fallback. Another issue is that probe_user_read (now __copy_from_user_nofault) does not return the number of bytes not copied in case of failure, so the destination memory is not being cleared anymore in kvmhv_copy_from_guest_radix: ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); if (ret > 0) <-- always false! memset(to + (n - ret), 0, ret); This patch fixes both issues by skipping access_ok and open-coding the low level __copy_to/from_user_inatomic. Fixes: def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-2-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index b5905ae4377c..44eb7b1ef289 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -65,10 +65,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, } isync(); + pagefault_disable(); if (is_load) - ret = copy_from_user_nofault(to, (const void __user *)from, n); + ret = __copy_from_user_inatomic(to, (const void __user *)from, n); else - ret = copy_to_user_nofault((void __user *)to, from, n); + ret = __copy_to_user_inatomic((void __user *)to, from, n); + pagefault_enable(); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) From c232461c0c3b0aca637f0d7658a7f5d0bb393a4e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:15 -0300 Subject: [PATCH 231/474] KVM: PPC: Book3S HV: Add sanity check to copy_tofrom_guest Both paths into __kvmhv_copy_tofrom_guest_radix ensure that we arrive with an effective address that is smaller than our total addressable space and addresses quadrant 0. - The H_COPY_TOFROM_GUEST hypercall path rejects the call with H_PARAMETER if the effective address has any of the twelve most significant bits set. - The kvmhv_copy_tofrom_guest_radix path clears the top twelve bits before calling the internal function. Although the callers make sure that the effective address is sane, any future use of the function is exposed to a programming error, so add a sanity check. Suggested-by: Nicholas Piggin Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-3-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 44eb7b1ef289..1b1c9e9e539b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, (to != NULL) ? __pa(to): 0, (from != NULL) ? __pa(from): 0, n); + if (eaddr & (0xFFFUL << 52)) + return ret; + quadrant = 1; if (!pid) quadrant = 2; From 0eb596f1e6103ebe122792a425b88c5dc21c4087 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:16 -0300 Subject: [PATCH 232/474] KVM: PPC: Book3S HV: Stop exporting symbols from book3s_64_mmu_radix The book3s_64_mmu_radix.o object is not part of the KVM builtins and all the callers of the exported symbols are in the same kvm-hv.ko module so we should not need to export any symbols. Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-4-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 1b1c9e9e539b..16359525a40f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -86,7 +86,6 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, return ret; } -EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, void *from, unsigned long n) @@ -122,14 +121,12 @@ long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, return ret; } -EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, unsigned long n) { return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); } -EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 root, From 113ec9ccc8049c3772f0eab46b62c5d6654c09f7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 05:16:05 +0000 Subject: [PATCH 233/474] powerpc/32: indirect function call use bctrl rather than blrl in ret_from_kernel_thread Copied from commit 89bbe4c798bc ("powerpc/64: indirect function call use bctrl rather than blrl in ret_from_kernel_thread") blrl is not recommended to use as an indirect function call, as it may corrupt the link stack predictor. This is not a performance critical path but this should be fixed for consistency. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/91b1d242525307ceceec7ef6e832bfbacdd4501b.1629436472.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/entry_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0273a1349006..61fdd53cdd9a 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -161,10 +161,10 @@ ret_from_fork: ret_from_kernel_thread: REST_NVGPRS(r1) bl schedule_tail - mtlr r14 + mtctr r14 mr r3,r15 PPC440EP_ERR42 - blrl + bctrl li r3,0 b ret_from_syscall From f5007dbf4da729baa850b33a64dc3cc53757bdf8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 Aug 2021 07:56:26 +0000 Subject: [PATCH 234/474] powerpc/booke: Avoid link stack corruption in several places Use bcl 20,31,+4 instead of bl in order to preserve link stack. See commit c974809a26a1 ("powerpc/vdso: Avoid link stack corruption in __get_datapage()") for details. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e9fbc285eceb720e6c0e032ef47fe8b05f669b48.1629791751.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 2 +- arch/powerpc/kernel/exceptions-64e.S | 6 +++--- arch/powerpc/kernel/fsl_booke_entry_mapping.S | 8 ++++---- arch/powerpc/kernel/head_44x.S | 6 +++--- arch/powerpc/kernel/head_fsl_booke.S | 6 +++--- arch/powerpc/mm/nohash/tlb_low.S | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index ffe712307e11..1c538a9a11e0 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -260,7 +260,7 @@ n: /* Be careful, this will clobber the lr register. */ #define LOAD_REG_ADDR_PIC(reg, name) \ - bl 0f; \ + bcl 20,31,$+4; \ 0: mflr reg; \ addis reg,reg,(name - 0b)@ha; \ addi reg,reg,(name - 0b)@l; diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..7e0943d9f9b0 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -1127,7 +1127,7 @@ found_iprot: * r3 = MAS0_TLBSEL (for the iprot array) * r4 = SPRN_TLBnCFG */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ @@ -1196,7 +1196,7 @@ skpinv: addi r6,r6,1 /* Increment */ mfmsr r6 xori r6,r6,MSR_IS mtspr SPRN_SRR1,r6 - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) mtspr SPRN_SRR0,r6 @@ -1256,7 +1256,7 @@ skpinv: addi r6,r6,1 /* Increment */ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping */ /* Now we branch the new virtual address mapped by this entry */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) tovirt(r6,r6) diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index 8bccce6544b5..dedc17fac8f8 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ @@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */ addi r6,r6,10 slw r6,r8,r6 /* convert to mask */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r7 mfspr r8,SPRN_MAS3 @@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */ xori r6,r4,1 slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r7,r9,0,20,31 addi r7,r7,(2f - 1b) @@ -207,7 +207,7 @@ next_tlb_setup: lis r7,MSR_KERNEL@h ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r6,r9,0,20,31 addi r6,r6,(2f - 1b) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index ddc978a2d381..02d2928d1e01 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -70,7 +70,7 @@ _ENTRY(_start); * address. * r21 will be loaded with the physical runtime address of _stext */ - bl 0f /* Get our runtime address */ + bcl 20,31,$+4 /* Get our runtime address */ 0: mflr r21 /* Make it accessible */ addis r21,r21,(_stext - 0b)@ha addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ @@ -853,7 +853,7 @@ _GLOBAL(init_cpu_state) wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ sync - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -1045,7 +1045,7 @@ head_start_47x: sync /* Find the entry we are running from */ - bl 1f + bcl 20,31,$+4 1: mflr r23 tlbsx r23,0,r23 tlbre r24,r23,0 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 0f9642f36b49..dbf3b89e543c 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -79,7 +79,7 @@ _ENTRY(_start); mr r23,r3 mr r25,r4 - bl 0f + bcl 20,31,$+4 0: mflr r8 addis r3,r8,(is_second_reloc - 0b)@ha lwz r19,(is_second_reloc - 0b)@l(r3) @@ -1132,7 +1132,7 @@ _GLOBAL(switch_to_as1) bne 1b /* Get the tlb entry used by the current running code */ - bl 0f + bcl 20,31,$+4 0: mflr r4 tlbsx 0,r4 @@ -1166,7 +1166,7 @@ _GLOBAL(switch_to_as1) _GLOBAL(restore_to_as0) mflr r0 - bl 0f + bcl 20,31,$+4 0: mflr r9 addi r9,r9,1f - 0b diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 4613bf8e9aae..5add4a51e51f 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -414,7 +414,7 @@ _GLOBAL(loadcam_multi) * Set up temporary TLB entry that is the same as what we're * running from, but in AS=1. */ - bl 1f + bcl 20,31,$+4 1: mflr r6 tlbsx 0,r8 mfspr r6,SPRN_MAS1 From 33e1402435cb9f3021439a15935ea2dc69ec1844 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 Aug 2021 07:56:35 +0000 Subject: [PATCH 235/474] powerpc: Avoid link stack corruption in misc asm functions bl;mflr is used at several places to get code position. Use bcl 20,31,+4 instead of bl in order to preserve link stack. See commit c974809a26a1 ("powerpc/vdso: Avoid link stack corruption in __get_datapage()") for details. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c6eabb4fb6c156f75d56dcbcc6f243e5ac0fba42.1629791763.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/misc.S | 2 +- arch/powerpc/kernel/misc_32.S | 2 +- arch/powerpc/kernel/misc_64.S | 2 +- arch/powerpc/kernel/reloc_32.S | 2 +- arch/powerpc/kexec/relocate_32.S | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 5be96feccb55..fb7de3543c03 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -29,7 +29,7 @@ _GLOBAL(reloc_offset) li r3, 0 _GLOBAL(add_reloc_offset) mflr r0 - bl 1f + bcl 20,31,$+4 1: mflr r5 PPC_LL r4,(2f-1b)(r5) subf r5,r4,r5 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index d8645efff902..e5127b19fec2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -67,7 +67,7 @@ _GLOBAL(reloc_got2) srwi. r8,r8,2 beqlr mtctr r8 - bl 1f + bcl 20,31,$+4 1: mflr r0 lis r4,1b@ha addi r4,r4,1b@l diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 4b761a18a74d..d38a019b38e1 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -255,7 +255,7 @@ _GLOBAL(scom970_write) * Physical (hardware) cpu id should be in r3. */ _GLOBAL(kexec_wait) - bl 1f + bcl 20,31,$+4 1: mflr r5 addi r5,r5,kexec_flag-1b diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index 10e96f3e22fe..0508c14b4c28 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -30,7 +30,7 @@ R_PPC_RELATIVE = 22 _GLOBAL(relocate) mflr r0 /* Save our LR */ - bl 0f /* Find our current runtime address */ + bcl 20,31,$+4 /* Find our current runtime address */ 0: mflr r12 /* Make it accessible */ mtlr r0 diff --git a/arch/powerpc/kexec/relocate_32.S b/arch/powerpc/kexec/relocate_32.S index 61946c19e07c..cf6e52bdf8d8 100644 --- a/arch/powerpc/kexec/relocate_32.S +++ b/arch/powerpc/kexec/relocate_32.S @@ -93,7 +93,7 @@ wmmucr: * Invalidate all the TLB entries except the current entry * where we are running from */ - bl 0f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 0: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -158,7 +158,7 @@ write_out: /* Switch to other address space in MSR */ insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 addi r8, r8, (2f-1b) /* Find the target offset */ @@ -202,7 +202,7 @@ next_tlb: li r9,0 insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 and r8, r8, r11 /* Get our offset within page */ addi r8, r8, (2f-1b) @@ -240,7 +240,7 @@ setup_map_47x: sync /* Find the entry we are running from */ - bl 2f + bcl 20,31,$+4 2: mflr r23 tlbsx r23, 0, r23 tlbre r24, r23, 0 /* TLB Word 0 */ @@ -296,7 +296,7 @@ clear_utlb_entry: /* Update the msr to the new TS */ insrwi r5, r7, 1, 26 - bl 1f + bcl 20,31,$+4 1: mflr r6 addi r6, r6, (2f-1b) @@ -355,7 +355,7 @@ write_utlb: /* Defaults to 256M */ lis r10, 0x1000 - bl 1f + bcl 20,31,$+4 1: mflr r4 addi r4, r4, (2f-1b) /* virtual address of 2f */ From 11f27a7fa4ca27935de74e3eb052bdc430d5f8d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:40 +0000 Subject: [PATCH 236/474] powerpc/ptdump: Use DEFINE_SHOW_ATTRIBUTE() Use DEFINE_SHOW_ATTRIBUTE() instead of open coding open() and fops. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b864a92693ca8413ef0b19f0c12065c212899b6e.1625762905.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/bats.c | 14 ++------------ arch/powerpc/mm/ptdump/hashpagetable.c | 12 +----------- arch/powerpc/mm/ptdump/ptdump.c | 13 +------------ arch/powerpc/mm/ptdump/segment_regs.c | 12 +----------- 4 files changed, 5 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index 8bf7383fb26c..820c119013e4 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) -static int bats_show_603(struct seq_file *m, void *v) +static int bats_show(struct seq_file *m, void *v) { seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); @@ -88,17 +88,7 @@ static int bats_show_603(struct seq_file *m, void *v) return 0; } -static int bats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bats_show_603, NULL); -} - -static const struct file_operations bats_fops = { - .open = bats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bats); static int __init bats_init(void) { diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index ad6df9a2e7c8..c7f824d294b2 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5062c58b1e5b..349fd8fe173f 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -397,18 +397,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static void build_pgtable_complete_mask(void) { diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 9223dfb85c51..9df3af8d481f 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v) return 0; } -static int sr_open(struct inode *inode, struct file *file) -{ - return single_open(file, sr_show, NULL); -} - -static const struct file_operations sr_fops = { - .open = sr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sr); static int __init sr_init(void) { From 64b87b0c70e0fd28352895cba3c0a9631e0072dd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:41 +0000 Subject: [PATCH 237/474] powerpc/ptdump: Remove unused 'page_size' parameter note_page_update_state() doesn't use page_size. Remove it. Could also be removed to note_page() but as a following patch will remove all current users of note_page(), just leave it as is for now. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2f80d052001155251bfe009c360d0c5d9242c6b.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/ptdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 349fd8fe173f..3eb8732641da 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -189,7 +189,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; u64 pa = val & PTE_RPN_MASK; @@ -213,7 +213,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* At first no level is set */ if (!st->level) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long addr, * Address indicates we have passed the end of the * current section of virtual memory */ - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); } } From cf98d2b6eea6a1b2c43f85680ad58fcc3ea9496b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:42 +0000 Subject: [PATCH 238/474] powerpc/ptdump: Reduce level numbers by 1 in note_page() and add p4d level Do the same as commit f8f0d0b6fa20 ("mm: ptdump: reduce level numbers by 1 in note_page()") and add missing p4d level. This will align powerpc to the users of generic ptdump. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d76495c574132b197b445a1f133755cca4b912a4.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/8xx.c | 6 ++++-- arch/powerpc/mm/ptdump/book3s64.c | 6 ++++-- arch/powerpc/mm/ptdump/ptdump.c | 17 +++++++++-------- arch/powerpc/mm/ptdump/shared.c | 6 ++++-- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 86da2a669680..fac932eb8f9a 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 14f73868db66..5ad92d9dc5d1 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 3eb8732641da..fb531bc64fc5 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -58,7 +58,7 @@ struct pg_state { const struct addr_marker *marker; unsigned long start_address; unsigned long start_pa; - unsigned int level; + int level; u64 current_flags; bool check_wx; unsigned long wx_pages; @@ -188,10 +188,9 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val) +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; u64 pa = val & PTE_RPN_MASK; st->level = level; @@ -206,12 +205,12 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, } static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + int level, u64 val, unsigned long page_size) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; /* At first no level is set */ - if (!st->level) { + if (st->level == -1) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); note_page_update_state(st, addr, level, val); /* @@ -383,6 +382,7 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .level = -1, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; @@ -393,7 +393,7 @@ static int ptdump_show(struct seq_file *m, void *v) /* Traverse kernel page tables */ walk_pagetables(&st); - note_page(&st, 0, 0, 0, 0); + note_page(&st, 0, -1, 0, 0); return 0; } @@ -415,6 +415,7 @@ void ptdump_check_wx(void) struct pg_state st = { .seq = NULL, .marker = address_markers, + .level = -1, .check_wx = true, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index c005fe041c18..03607ab90c66 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ From e084728393a58e7fdafeee2e6b20e0faff09b557 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:43 +0000 Subject: [PATCH 239/474] powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP This patch converts powerpc to the generic PTDUMP implementation. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/03166d569526be70214fe9370a7bad219d2f41c8.1625762907.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 30 ------- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_decl.h | 2 +- arch/powerpc/mm/ptdump/Makefile | 9 +- arch/powerpc/mm/ptdump/ptdump.c | 146 ++++++++------------------------ 6 files changed, 47 insertions(+), 144 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d01e3401581d..ec866085ff8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -123,6 +123,7 @@ config PPC select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_ELF_RANDOMIZE @@ -182,6 +183,7 @@ config PPC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 205cd77f321f..192f0ed0097f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -365,36 +365,6 @@ config FAIL_IOMMU If you are unsure, say N. -config PPC_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL && DEBUG_FS - help - This option exports the state of the kernel pagetables to a - debugfs file. This is only useful for kernel developers who are - working in architecture specific areas of the kernel - probably - not a good idea to enable this feature in a production kernel. - - If you are unsure, say N. - -config PPC_DEBUG_WX - bool "Warn on W+X mappings at boot" - depends on PPC_PTDUMP && STRICT_KERNEL_RWX - help - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index eae4ec2988fc..df8172da2301 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_PTDUMP_CORE) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7dac910c0b21..dd1cabc2ea0f 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void); #else static inline void ptdump_check_wx(void) { } diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 712762be3cb1..4050cbb55acf 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -5,5 +5,10 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o -obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index fb531bc64fc5..2d80d775d15e 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ * */ struct pg_state { + struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; @@ -102,6 +104,11 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -204,10 +211,10 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, int } } -static void note_page(struct pg_state *st, unsigned long addr, - int level, u64 val, unsigned long page_size) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); /* At first no level is set */ if (st->level == -1) { @@ -245,94 +252,6 @@ static void note_page(struct pg_state *st, unsigned long addr, } } -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE); - - } -} - -static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start, - int pdshift, int level) -{ -#ifdef CONFIG_ARCH_HAS_HUGEPD - unsigned int i; - int shift = hugepd_shift(*phpd); - int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1; - - if (start & ((1 << shift) - 1)) - return; - - for (i = 0; i < ptrs_per_hpd; i++) { - unsigned long addr = start + (i << shift); - pte_t *pte = hugepte_offset(*phpd, addr, pdshift); - - note_page(st, addr, level + 1, pte_val(*pte), 1 << shift); - } -#endif -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE); - } -} - -static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) -{ - pud_t *pud = pud_offset(p4d, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_is_leaf(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud), PUD_SIZE); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - unsigned int i; - unsigned long addr = st->start_address & PGDIR_MASK; - pgd_t *pgd = pgd_offset_k(addr); - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - p4d_t *p4d = p4d_offset(pgd, 0); - - if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) - note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); - else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); - else - /* p4d exists */ - walk_pud(st, p4d, addr); - } -} - static void populate_markers(void) { int i = 0; @@ -383,17 +302,14 @@ static int ptdump_show(struct seq_file *m, void *v) .seq = m, .marker = address_markers, .level = -1, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, -1, 0, 0); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); return 0; } @@ -409,23 +325,24 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, - .marker = address_markers, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, .level = -1, .check_wx = true, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - - walk_pagetables(&st); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", @@ -435,12 +352,21 @@ void ptdump_check_wx(void) } #endif -static int ptdump_init(void) +static int __init ptdump_init(void) { +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; +#endif + populate_markers(); build_pgtable_complete_mask(); - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; } device_initcall(ptdump_init); From 316389e904f968d24d44cd96a6d171ee1ef269cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 25 Jun 2021 10:58:33 +0000 Subject: [PATCH 240/474] powerpc/syscalls: Simplify do_mmap2() When shift is nul, operations remain valid so no test needed. And 'ret' is unnecessary. And use IS_ALIGNED() to check alignment, that's more clear. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/373ec500f386374bc5735007df3d3869eac47be1.1624618701.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/syscalls.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index bf4ae0f0e36c..825931e400df 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -41,20 +41,13 @@ static inline long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - long ret = -EINVAL; - if (!arch_validate_prot(prot, addr)) - goto out; + return -EINVAL; - if (shift) { - if (off & ((1 << shift) - 1)) - goto out; - off >>= shift; - } + if (!IS_ALIGNED(off, 1 << shift)) + return -EINVAL; - ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); -out: - return ret; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, From 19e932eb6ea47f4f37513eb2ae0daee19117765c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 17 Aug 2021 16:00:14 +0000 Subject: [PATCH 241/474] powerpc/ptrace: Make user_mode() common to PPC32 and PPC64 Today we have: #ifdef __powerpc64__ #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) #else #define user_mode(regs) (((regs)->msr & MSR_PR) != 0) #endif With ppc64_defconfig, we get: if (!user_mode(regs)) 14b4: e9 3e 01 08 ld r9,264(r30) 14b8: 71 29 40 00 andi. r9,r9,16384 14bc: 41 82 07 a4 beq 1c60 <.emulate_instruction+0x7d0> If taking the ppc32 definition of user_mode(), the exact same code is generated for ppc64_defconfig. So, only keep one version of user_mode(), preferably the one not using MSR_PR_LG which should be kept internal to reg.h. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/000a28c51808bbd802b505af42d2cb316c2be7d3.1629216000.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..333979cf5d1e 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -181,11 +181,7 @@ static inline unsigned long frame_pointer(struct pt_regs *regs) return 0; } -#ifdef __powerpc64__ -#define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) -#else #define user_mode(regs) (((regs)->msr & MSR_PR) != 0) -#endif #define force_successful_syscall_return() \ do { \ From 9401f4e46cf6965e23738f70e149172344a01eef Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Mar 2021 08:48:11 +0000 Subject: [PATCH 242/474] powerpc: Use lwarx/ldarx directly instead of PPC_LWARX/LDARX macros Force the eh flag at 0 on PPC32. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1fc81f07cabebb875b963e295408cc3dd38c8d85.1614674882.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-compat.h | 4 ++-- arch/powerpc/include/asm/atomic.h | 4 ++-- arch/powerpc/include/asm/bitops.h | 8 ++++---- arch/powerpc/include/asm/ppc-opcode.h | 2 -- arch/powerpc/include/asm/simple_spinlock.h | 6 +++--- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 19b70c5b5f18..2b736d9fbb1b 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -17,7 +17,7 @@ #define PPC_LONG stringify_in_c(.8byte) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) -#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(ldarx) #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) #define PPC_MTOCRF(FXM, RS) MTOCRF((FXM), RS) @@ -50,7 +50,7 @@ #define PPC_LONG stringify_in_c(.long) #define PPC_LONG_ALIGN stringify_in_c(.balign 4) #define PPC_TLNEI stringify_in_c(twnei) -#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(lwarx) #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index a1732a79e92a..6a53ef178bfd 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -207,7 +207,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) int r, o = *old; __asm__ __volatile__ ( -"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n" +"1: lwarx %0,0,%2,%5 # atomic_try_cmpxchg_acquire \n" " cmpw 0,%0,%3 \n" " bne- 2f \n" " stwcx. %4,0,%2 \n" @@ -215,7 +215,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (r), "+m" (v->counter) - : "r" (&v->counter), "r" (o), "r" (new) + : "r" (&v->counter), "r" (o), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) : "cr0", "memory"); if (unlikely(r != o)) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 299ab33505a6..11847b6a244e 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -70,7 +70,7 @@ static inline void fn(unsigned long mask, \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,0) "\n" \ +"1:" PPC_LLARX "%0,0,%3,0\n" \ stringify_in_c(op) "%0,%0,%2\n" \ PPC_STLCX "%0,0,%3\n" \ "bne- 1b\n" \ @@ -115,13 +115,13 @@ static inline unsigned long fn( \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ +"1:" PPC_LLARX "%0,0,%3,%4\n" \ stringify_in_c(op) "%1,%0,%2\n" \ PPC_STLCX "%1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p) \ + : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ : "cc", "memory"); \ return (old & mask); \ } @@ -170,7 +170,7 @@ clear_bit_unlock_return_word(int nr, volatile unsigned long *addr) __asm__ __volatile__ ( PPC_RELEASE_BARRIER -"1:" PPC_LLARX(%0,0,%3,0) "\n" +"1:" PPC_LLARX "%0,0,%3,0\n" "andc %1,%0,%2\n" PPC_STLCX "%1,0,%3\n" "bne- 1b\n" diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index bede76dd3db7..baea657bc868 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -576,8 +576,6 @@ #define PPC_DIVDE(t, a, b) stringify_in_c(.long PPC_RAW_DIVDE(t, a, b)) #define PPC_DIVDEU(t, a, b) stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b)) #define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh)) -#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh)) -#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LWARX(t, a, b, eh)) #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_RAW_STQCX(t, a, b)) #define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHD(t, a, b, c)) #define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHDU(t, a, b, c)) diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 552f325412cc..8985791a2ba5 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -51,7 +51,7 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) token = LOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -179,7 +179,7 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw) long tmp; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%1,1) "\n" +"1: lwarx %0,0,%1,1\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -203,7 +203,7 @@ static inline long __arch_write_trylock(arch_rwlock_t *rw) token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n" " stwcx. %1,0,%2\n\ From fd42b7b09c602c904452c0c3e5955ca21d8e387a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:35 +1000 Subject: [PATCH 243/474] KVM: PPC: Book3S HV: Initialise vcpu MSR with MSR_ME It is possible to create a VCPU without setting the MSR before running it, which results in a warning in kvmhv_vcpu_entry_p9() that MSR_ME is not set. This is pretty harmless because the MSR_ME bit is added to HSRR1 before HRFID to guest, and a normal qemu guest doesn't hit it. Initialise the vcpu MSR with MSR_ME set. Reported-by: Alexey Kardashevskiy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-2-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..c402eb0276b0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2684,6 +2684,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.shregs.msr = MSR_ME; vcpu->arch.intr_msr = MSR_SF | MSR_ME; /* From daac40e8d7a63ab8608132a7cfce411986feac32 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:36 +1000 Subject: [PATCH 244/474] KVM: PPC: Book3S HV: Remove TM emulation from POWER7/8 path TM fake-suspend emulation is only used by POWER9. Remove it from the old code path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-3-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 42 ------------------------- 1 file changed, 42 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 8dd437d7a2c6..75079397c2a5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1088,12 +1088,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif - /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1599,42 +1593,6 @@ maybe_reenter_guest: blt deliver_guest_interrupt b guest_exit_cont -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -/* - * Softpatch interrupt for transactional memory emulation cases - * on POWER9 DD2.2. This is early in the guest exit path - we - * haven't saved registers or done a treclaim yet. - */ -kvmppc_tm_emul: - /* Save instruction image in HEIR */ - mfspr r3, SPRN_HEIR - stw r3, VCPU_HEIR(r9) - - /* - * The cases we want to handle here are those where the guest - * is in real suspend mode and is trying to transition to - * transactional mode. - */ - lbz r0, HSTATE_FAKE_SUSPEND(r13) - cmpwi r0, 0 /* keep exiting guest if in fake suspend */ - bne guest_exit_cont - rldicl r3, r11, 64 - MSR_TS_S_LG, 62 - cmpwi r3, 1 /* or if not in suspend state */ - bne guest_exit_cont - - /* Call C code to do the emulation */ - mr r3, r9 - bl kvmhv_p9_tm_emulation_early - nop - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - cmpwi r3, 0 - beq guest_exit_cont /* continue exiting if not handled */ - ld r10, VCPU_PC(r9) - ld r11, VCPU_MSR(r9) - b fast_interrupt_c_return /* go back to guest if handled */ -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing From 4782e0cd0d184d727ad3b0cfe20d1d44d9f98239 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:37 +1000 Subject: [PATCH 245/474] KVM: PPC: Book3S HV P9: Fixes for TM softpatch interrupt NIP The softpatch interrupt sets HSRR0 to the faulting instruction +4, so it should subtract 4 for the faulting instruction address in the case it is a TM softpatch interrupt (the instruction was not executed) and it was not emulated. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-4-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_tm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index cc90b8b82329..e7c36f8bf205 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -46,6 +46,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) u64 newmsr, bescr; int ra, rs; + /* + * The TM softpatch interrupt sets NIP to the instruction following + * the faulting instruction, which is not executed. Rewind nip to the + * faulting instruction so it looks like a normal synchronous + * interrupt, then update nip in the places where the instruction is + * emulated. + */ + vcpu->arch.regs.nip -= 4; + /* * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit * in these instructions, so masking bit 31 out doesn't change these @@ -67,7 +76,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) (newmsr & MSR_TM))); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.shregs.srr0; return RESUME_GUEST; @@ -100,7 +109,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.bescr = bescr; msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; vcpu->arch.shregs.msr = msr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.ebbrr; return RESUME_GUEST; @@ -116,6 +125,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -152,6 +162,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) msr = (msr & ~MSR_TS_MASK) | MSR_TS_S; } vcpu->arch.shregs.msr = msr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -189,6 +200,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -220,6 +232,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr = msr | MSR_TS_S; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; } From d82b392d9b3556b63e3f9916cf057ea847e173a9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:38 +1000 Subject: [PATCH 246/474] KVM: PPC: Book3S HV Nested: Fix TM softpatch HFAC interrupt emulation Have the TM softpatch emulation code set up the HFAC interrupt and return -1 in case an instruction was executed with HFSCR bits clear, and have the interrupt exit handler fall through to the HFAC handler. When the L0 is running a nested guest, this ensures the HFAC interrupt is correctly passed up to the L1. The "direct guest" exit handler will turn these into PROGILL program interrupts so functionality in practice will be unchanged. But it's possible an L1 would want to handle these in a different way. Also rearrange the FAC interrupt emulation code to match the HFAC format while here (mainly, adding the FSCR_INTR_CAUSE mask). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-5-npiggin@gmail.com --- arch/powerpc/include/asm/reg.h | 3 ++- arch/powerpc/kvm/book3s_hv.c | 35 ++++++++++++++++---------- arch/powerpc/kvm/book3s_hv_tm.c | 44 ++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index be85cf156a1f..e9d27265253b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -415,6 +415,7 @@ #define FSCR_TAR __MASK(FSCR_TAR_LG) #define FSCR_EBB __MASK(FSCR_EBB_LG) #define FSCR_DSCR __MASK(FSCR_DSCR_LG) +#define FSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ #define HFSCR_PREFIX __MASK(FSCR_PREFIX_LG) #define HFSCR_MSGP __MASK(FSCR_MSGP_LG) @@ -426,7 +427,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) -#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ +#define HFSCR_INTR_CAUSE FSCR_INTR_CAUSE #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c402eb0276b0..e7df8a3ca62c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1679,6 +1679,21 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, r = RESUME_GUEST; } break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ +#endif + /* * This occurs if the guest (kernel or userspace), does something that * is prohibited by HFSCR. @@ -1697,18 +1712,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, } break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case BOOK3S_INTERRUPT_HV_SOFTPATCH: - /* - * This occurs for various TM-related instructions that - * we need to emulate on POWER9 DD2.2. We have already - * handled the cases where the guest was in real-suspend - * mode and was transitioning to transactional state. - */ - r = kvmhv_p9_tm_emulation(vcpu); - break; -#endif - case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; break; @@ -1811,9 +1814,15 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) * mode and was transitioning to transactional state. */ r = kvmhv_p9_tm_emulation(vcpu); - break; + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ #endif + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index e7c36f8bf205..866cadd70094 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -88,14 +88,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check EBB facility is available */ if (!(vcpu->arch.hfscr & HFSCR_EBB)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_EBB_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; } @@ -138,14 +139,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -169,14 +171,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK): /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -208,14 +211,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) /* XXX do we need to check for PR=0 here? */ /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; From 7487cabc7ed2f7716bf304e4e97c57fd995cf70e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:39 +1000 Subject: [PATCH 247/474] KVM: PPC: Book3S HV Nested: Sanitise vcpu registers As one of the arguments of the H_ENTER_NESTED hypercall, the nested hypervisor (L1) prepares a structure containing the values of various hypervisor-privileged registers with which it wants the nested guest (L2) to run. Since the nested HV runs in supervisor mode it needs the host to write to these registers. To stop a nested HV manipulating this mechanism and using a nested guest as a proxy to access a facility that has been made unavailable to it, we have a routine that sanitises the values of the HV registers before copying them into the nested guest's vcpu struct. However, when coming out of the guest the values are copied as they were back into L1 memory, which means that any sanitisation we did during guest entry will be exposed to L1 after H_ENTER_NESTED returns. This patch alters this sanitisation to have effect on the vcpu->arch registers directly before entering and after exiting the guest, leaving the structure that is copied back into L1 unchanged (except when we really want L1 to access the value, e.g the Cause bits of HFSCR). Signed-off-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Link: https://lore.kernel.org/r/20210811160134.904987-6-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_nested.c | 94 ++++++++++++++--------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 898f942eb198..1eb4e989edc7 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -105,7 +105,6 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->dpdes = vc->dpdes; - hr->hfscr = vcpu->arch.hfscr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, case BOOK3S_INTERRUPT_H_INST_STORAGE: hr->asdr = vcpu->arch.fault_gpa; break; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | + (HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; } } -/* - * This can result in some L0 HV register state being leaked to an L1 - * hypervisor when the hv_guest_state is copied back to the guest after - * being modified here. - * - * There is no known problem with such a leak, and in many cases these - * register settings could be derived by the guest by observing behaviour - * and timing, interrupts, etc., but it is an issue to consider. - */ -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - u64 mask; - - /* - * Don't let L1 change LPCR bits for the L2 except these: - */ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - - /* - * Additional filtering is required depending on hardware - * and configuration. - */ - hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, - (vc->lpcr & ~mask) | (hr->lpcr & mask)); - - /* - * Don't let L1 enable features for L2 which we've disabled for L1, - * but preserve the interrupt cause field. - */ - hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); - - /* Don't let data address watchpoint match in hypervisor state */ - hr->dawrx0 &= ~DAWRX_HYP; - hr->dawrx1 &= ~DAWRX_HYP; - - /* Don't let completed instruction address breakpt match in HV state */ - if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) - hr->ciabr &= ~CIABR_PRIV; -} - -static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, sizeof(struct pt_regs)); } +static void load_l2_hv_regs(struct kvm_vcpu *vcpu, + const struct hv_guest_state *l2_hv, + const struct hv_guest_state *l1_hv, u64 *lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + restore_hv_regs(vcpu, l2_hv); + + /* + * Don't let L1 change LPCR bits for the L2 except these: + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* + * Additional filtering is required depending on hardware + * and configuration. + */ + *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (*lpcr & mask)); + + /* + * Don't let L1 enable features for L2 which we've disabled for L1, + * but preserve the interrupt cause field. + */ + vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + + /* Don't let data address watchpoint match in hypervisor state */ + vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP; + vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV; +} + long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) { long int err, r; @@ -296,7 +294,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) struct hv_guest_state l2_hv = {0}, saved_l1_hv; struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 hv_ptr, regs_ptr; - u64 hdec_exp; + u64 hdec_exp, lpcr; s64 delta_purr, delta_spurr, delta_ic, delta_vtb; if (vcpu->kvm->arch.l1_ptcr == 0) @@ -369,8 +367,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* Guest must always run with ME enabled, HV disabled. */ vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV; - sanitise_hv_regs(vcpu, &l2_hv); - restore_hv_regs(vcpu, &l2_hv); + lpcr = l2_hv.lpcr; + load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr); vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; @@ -380,7 +378,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } - r = kvmhv_run_single_vcpu(vcpu, hdec_exp, l2_hv.lpcr); + r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr); } while (is_kvmppc_resume_guest(r)); /* save L2 state for return */ From 8b210a880b35ba75eb42b79dfd65e369c1feb119 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:40 +1000 Subject: [PATCH 248/474] KVM: PPC: Book3S HV Nested: Make nested HFSCR state accessible When the L0 runs a nested L2, there are several permutations of HFSCR that can be relevant. The HFSCR that the L1 vcpu L1 requested, the HFSCR that the L1 vcpu may use, and the HFSCR that is actually being used to run the L2. The L1 requested HFSCR is not accessible outside the nested hcall handler, so copy that into a new kvm_nested_guest.hfscr field. The permitted HFSCR is taken from the HFSCR that the L1 runs with, which is also not accessible while the hcall is being made. Move this into a new kvm_vcpu_arch.hfscr_permitted field. These will be used by the next patch to improve facility handling for nested guests, and later by facility demand faulting patches. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-7-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 + arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_nested.c | 5 +++-- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index eaf3a562bf1e..19b6942c6969 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -39,6 +39,7 @@ struct kvm_nested_guest { pgd_t *shadow_pgtable; /* our page table for this guest */ u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ u64 process_table; /* process table entry for this guest */ + u64 hfscr; /* HFSCR that the L1 requested for this nested guest */ long refcnt; /* number of pointers to this struct */ struct mutex tlb_lock; /* serialize page faults and tlbies */ struct kvm_nested_guest *next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9f52f282b1aa..a779f7849cfb 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -811,6 +811,8 @@ struct kvm_vcpu_arch { u32 online; + u64 hfscr_permitted; /* A mask of permitted HFSCR facilities */ + /* For support of nested guests */ struct kvm_nested_guest *nested; u32 nested_vcpu_id; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e7df8a3ca62c..9d31267d26b3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2715,6 +2715,8 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; + vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 1eb4e989edc7..5ad5014c6f68 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -272,10 +272,10 @@ static void load_l2_hv_regs(struct kvm_vcpu *vcpu, (vc->lpcr & ~mask) | (*lpcr & mask)); /* - * Don't let L1 enable features for L2 which we've disabled for L1, + * Don't let L1 enable features for L2 which we don't allow for L1, * but preserve the interrupt cause field. */ - vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted); /* Don't let data address watchpoint match in hypervisor state */ vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP; @@ -362,6 +362,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* set L1 state to L2 state */ vcpu->arch.nested = l2; vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + l2->hfscr = l2_hv.hfscr; vcpu->arch.regs = l2_regs; /* Guest must always run with ME enabled, HV disabled. */ From 7c3ded5735141ff4d049747c9f76672a8b737c49 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:41 +1000 Subject: [PATCH 249/474] KVM: PPC: Book3S HV Nested: Stop forwarding all HFUs to L1 If the nested hypervisor has no access to a facility because it has been disabled by the host, it should also not be able to see the Hypervisor Facility Unavailable that arises from one of its guests trying to access the facility. This patch turns a HFU that happened in L2 into a Hypervisor Emulation Assistance interrupt and forwards it to L1 for handling. The ones that happened because L1 explicitly disabled the facility for L2 are still let through, along with the corresponding Cause bits in the HFSCR. Signed-off-by: Fabiano Rosas [np: move handling into kvmppc_handle_nested_exit] Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-8-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9d31267d26b3..15e2ba93678e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1730,6 +1730,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; int r; int srcu_idx; @@ -1819,9 +1820,35 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) fallthrough; /* go to facility unavailable handler */ #endif - case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - r = RESUME_HOST; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: { + u64 cause = vcpu->arch.hfscr >> 56; + + /* + * Only pass HFU interrupts to the L1 if the facility is + * permitted but disabled by the L1's HFSCR, otherwise + * the interrupt does not make sense to the L1 so turn + * it into a HEAI. + */ + if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || + (nested->hfscr & (1UL << cause))) { + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + + /* + * If the fetch failed, return to guest and + * try executing it again. + */ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, + &vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + } else { + r = RESUME_HOST; + } + break; + } case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; From f2e29db156523bf08a0524e0f4306a641912c6d9 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:42 +1000 Subject: [PATCH 250/474] KVM: PPC: Book3S HV Nested: save_hv_return_state does not require trap argument vcpu is already anargument so vcpu->arch.trap can be used directly. Signed-off-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-9-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 5ad5014c6f68..ed8a2c9f5629 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -99,7 +99,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->dawrx1 = swab64(hr->dawrx1); } -static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, +static void save_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -118,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; - switch (trap) { + switch (vcpu->arch.trap) { case BOOK3S_INTERRUPT_H_DATA_STORAGE: hr->hdar = vcpu->arch.fault_dar; hr->hdsisr = vcpu->arch.fault_dsisr; @@ -389,7 +389,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) delta_spurr = vcpu->arch.spurr - l2_hv.spurr; delta_ic = vcpu->arch.ic - l2_hv.ic; delta_vtb = vc->vtb - l2_hv.vtb; - save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + save_hv_return_state(vcpu, &l2_hv); /* restore L1 state */ vcpu->arch.nested = NULL; From 1782663897945a5cf28e564ba5eed730098e9aa4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:43 +1000 Subject: [PATCH 251/474] KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 when guest SPRs are live After the L1 saves its PMU SPRs but before loading the L2's PMU SPRs, switch the pmcregs_in_use field in the L1 lppaca to the value advertised by the L2 in its VPA. On the way out of the L2, set it back after saving the L2 PMU registers (if they were in-use). This transfers the PMU liveness indication between the L1 and L2 at the points where the registers are not live. This fixes the nested HV bug for which a workaround was added to the L0 HV by commit 63279eeb7f93a ("KVM: PPC: Book3S HV: Always save guest pmu for guest capable of nesting"), which explains the problem in detail. That workaround is no longer required for guests that include this bug fix. Fixes: 360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20210811160134.904987-10-npiggin@gmail.com --- arch/powerpc/include/asm/pmc.h | 7 +++++++ arch/powerpc/kvm/book3s_hv.c | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index c6bbe9778d3c..3c09109e708e 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse) #endif } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static inline int ppc_get_pmu_inuse(void) +{ + return get_paca()->pmcregs_in_use; +} +#endif + extern void power4_enable_pmcs(void); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 15e2ba93678e..938b0948478f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -3891,6 +3892,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; + } else { + get_lppaca()->pmcregs_in_use = 1; + } + barrier(); + } +#endif kvmhv_load_guest_pmu(vcpu); msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); @@ -4025,6 +4038,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif vc->entry_exit_map = 0x101; vc->in_guest = 0; From 0c8fb653d487d2873f5eefdcaf4cecff4e103828 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:44 +1000 Subject: [PATCH 252/474] powerpc/64s: Remove WORT SPR from POWER9/10 This register is not architected and not implemented in POWER9 or 10, it just reads back zeroes for compatibility. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20210811160134.904987-11-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 3 --- arch/powerpc/platforms/powernv/idle.c | 2 -- 2 files changed, 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 938b0948478f..dc8f18b0aa75 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3767,7 +3767,6 @@ static void load_spr_state(struct kvm_vcpu *vcpu) mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); mtspr(SPRN_BESCR, vcpu->arch.bescr); - mtspr(SPRN_WORT, vcpu->arch.wort); mtspr(SPRN_TIDR, vcpu->arch.tid); mtspr(SPRN_AMR, vcpu->arch.amr); mtspr(SPRN_UAMOR, vcpu->arch.uamor); @@ -3794,7 +3793,6 @@ static void store_spr_state(struct kvm_vcpu *vcpu) vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); - vcpu->arch.wort = mfspr(SPRN_WORT); vcpu->arch.tid = mfspr(SPRN_TIDR); vcpu->arch.amr = mfspr(SPRN_AMR); vcpu->arch.uamor = mfspr(SPRN_UAMOR); @@ -3826,7 +3824,6 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { mtspr(SPRN_PSPB, 0); - mtspr(SPRN_WORT, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_os_sprs->dscr); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 528a7e0cf83a..180baecad914 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -667,7 +667,6 @@ static unsigned long power9_idle_stop(unsigned long psscr) sprs.purr = mfspr(SPRN_PURR); sprs.spurr = mfspr(SPRN_SPURR); sprs.dscr = mfspr(SPRN_DSCR); - sprs.wort = mfspr(SPRN_WORT); sprs.ciabr = mfspr(SPRN_CIABR); sprs.mmcra = mfspr(SPRN_MMCRA); @@ -785,7 +784,6 @@ core_woken: mtspr(SPRN_PURR, sprs.purr); mtspr(SPRN_SPURR, sprs.spurr); mtspr(SPRN_DSCR, sprs.dscr); - mtspr(SPRN_WORT, sprs.wort); mtspr(SPRN_CIABR, sprs.ciabr); mtspr(SPRN_MMCRA, sprs.mmcra); From b1643084d164cea0c107a39bcdf0119fc52619af Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:54 +0530 Subject: [PATCH 253/474] powerpc/perf: Use stack siar instead of mfspr Minor optimization in the 'perf_instruction_pointer' function code by making use of stack siar instead of mfspr. Fixes: 75382aa72f06 ("powerpc/perf: Move code to select SIAR or pt_regs into perf_read_regs") Signed-off-by: Kajol Jain Tested-by: Nageswara R Sastry Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-1-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 91203ed9d0ff..3a782a35100d 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2269,7 +2269,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) else return regs->nip; } else if (use_siar && siar_valid(regs)) - return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + return siar + perf_ip_adjust(regs); else if (use_siar) return 0; // no valid instruction pointer else From cc90c6742ef5b438f4cb86029d7a794bd0a44a06 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:55 +0530 Subject: [PATCH 254/474] powerpc/perf: Drop the case of returning 0 as instruction pointer Drop the case of returning 0 as instruction pointer since kernel never executes at 0 and userspace almost never does either. Fixes: e6878835ac47 ("powerpc/perf: Sample only if SIAR-Valid bit is set in P7+") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-2-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3a782a35100d..9bb466d2d99e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2270,8 +2270,6 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) return regs->nip; } else if (use_siar && siar_valid(regs)) return siar + perf_ip_adjust(regs); - else if (use_siar) - return 0; // no valid instruction pointer else return regs->nip; } From 3c69a5f22223fa3e312689ec218b5059784d49d7 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:56 +0530 Subject: [PATCH 255/474] powerpc/perf: Fix the check for SIAR value Incase of random sampling, there can be scenarios where Sample Instruction Address Register(SIAR) may not latch to the sampled instruction and could result in the value of 0. In these scenarios it is preferred to return regs->nip. These corner cases are seen in the previous generation (p9) also. Patch adds the check for SIAR value along with regs_use_siar and siar_valid checks so that the function will return regs->nip incase SIAR is zero. Patch drops the code under PPMU_P10_DD1 flag check which handles SIAR 0 case only for Power10 DD1. Fixes: 2ca13a4cc56c9 ("powerpc/perf: Use regs->nip when SIAR is zero") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-3-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 9bb466d2d99e..73e62e9b179b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2260,15 +2260,9 @@ unsigned long perf_misc_flags(struct pt_regs *regs) */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - bool use_siar = regs_use_siar(regs); unsigned long siar = mfspr(SPRN_SIAR); - if (ppmu && (ppmu->flags & PPMU_P10_DD1)) { - if (siar) - return siar; - else - return regs->nip; - } else if (use_siar && siar_valid(regs)) + if (regs_use_siar(regs) && siar_valid(regs) && siar) return siar + perf_ip_adjust(regs); else return regs->nip; From c95278a0534449efc64ac8169382bce217963be2 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Thu, 29 Jul 2021 14:13:16 +1000 Subject: [PATCH 256/474] selftests/powerpc: Add missing clobbered register to to ptrace TM tests ISA v3.1 removes TM but includes a synthetic implementation for backwards compatibility. With this implementation, the tests ptrace-tm-spd-gpr and ptrace-tm-gpr should never be able to make any forward progress and eventually should be killed by the timeout. Instead on a P10 running in P9 mode, ptrace_tm_gpr fails like so: test: ptrace_tm_gpr tags: git_version:unknown Starting the child ... ... GPR[27]: 1 Expected: 2 GPR[28]: 1 Expected: 2 GPR[29]: 1 Expected: 2 GPR[30]: 1 Expected: 2 GPR[31]: 1 Expected: 2 [FAIL] Test FAILED on line 98 failure: ptrace_tm_gpr selftests: ptrace-tm-gpr [FAIL] The problem is in the inline assembly of the child. r0 is loaded with a value in the child's transaction abort handler but this register is not included in the clobbers list. This means it is possible that this statement: cptr[1] = 0; which is meant to signal the parent to wait may actually use the value placed into r0 by the inline assembly incorrectly signal the parent to continue. By inspection the same problem is present in ptrace-tm-spd-gpr. Adding r0 to the clobbbers list makes the test fail correctly via a timeout on a P10 running in P8/P9 compatibility mode. Suggested-by: Michael Neuling Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729041317.366612-1-jniethe5@gmail.com --- tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c | 2 +- tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c index 82f7bdc2e5e6..7df7100a29be 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c @@ -57,7 +57,7 @@ trans: : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a), [flt_2] "b" (&b), [cptr1] "b" (&cptr[1]) - : "memory", "r7", "r8", "r9", "r10", + : "memory", "r0", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c index ad65be6e8e85..8706bea5d015 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c @@ -65,7 +65,7 @@ trans: : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [gpr_4]"i"(GPR_4), [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a), [flt_4] "b" (&d) - : "memory", "r5", "r6", "r7", + : "memory", "r0", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" From e42edf9b9d126bb1c743f2e7984877ba27f09fe7 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Thu, 29 Jul 2021 14:13:17 +1000 Subject: [PATCH 257/474] selftests: Skip TM tests on synthetic TM implementations Transactional Memory was removed from the architecture in ISA v3.1. For threads running in P8/P9 compatibility mode on P10 a synthetic TM implementation is provided. In this implementation, tbegin. always sets cr0 eq meaning the abort handler is always called. This is not an issue as users of TM are expected to have a fallback non transactional way to make forward progress in the abort handler. The TEXASR indicates if a transaction failure is due to a synthetic implementation. Some of the TM self tests need a non-degenerate TM implementation for their testing to be meaningful so check for a synthetic implementation and skip the test if so. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729041317.366612-2-jniethe5@gmail.com --- .../selftests/powerpc/ptrace/ptrace-tm-gpr.c | 1 + .../powerpc/ptrace/ptrace-tm-spd-gpr.c | 1 + .../powerpc/ptrace/ptrace-tm-spd-tar.c | 1 + .../powerpc/ptrace/ptrace-tm-spd-vsx.c | 1 + .../selftests/powerpc/ptrace/ptrace-tm-spr.c | 1 + .../selftests/powerpc/ptrace/ptrace-tm-tar.c | 1 + .../selftests/powerpc/ptrace/ptrace-tm-vsx.c | 1 + .../selftests/powerpc/signal/signal_tm.c | 1 + tools/testing/selftests/powerpc/tm/tm-exec.c | 1 + tools/testing/selftests/powerpc/tm/tm-fork.c | 1 + .../testing/selftests/powerpc/tm/tm-poison.c | 1 + .../selftests/powerpc/tm/tm-resched-dscr.c | 1 + .../powerpc/tm/tm-signal-context-chk-fpu.c | 1 + .../powerpc/tm/tm-signal-context-chk-gpr.c | 1 + .../powerpc/tm/tm-signal-context-chk-vmx.c | 1 + .../powerpc/tm/tm-signal-context-chk-vsx.c | 1 + .../powerpc/tm/tm-signal-pagefault.c | 1 + .../powerpc/tm/tm-signal-sigreturn-nt.c | 1 + .../selftests/powerpc/tm/tm-signal-stack.c | 1 + .../selftests/powerpc/tm/tm-sigreturn.c | 1 + .../testing/selftests/powerpc/tm/tm-syscall.c | 2 +- tools/testing/selftests/powerpc/tm/tm-tar.c | 1 + tools/testing/selftests/powerpc/tm/tm-tmspr.c | 1 + tools/testing/selftests/powerpc/tm/tm-trap.c | 1 + .../selftests/powerpc/tm/tm-unavailable.c | 1 + .../selftests/powerpc/tm/tm-vmx-unavail.c | 1 + .../testing/selftests/powerpc/tm/tm-vmxcopy.c | 1 + tools/testing/selftests/powerpc/tm/tm.h | 36 +++++++++++++++++++ 28 files changed, 63 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c index 7df7100a29be..67ca297c5cca 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c @@ -113,6 +113,7 @@ int ptrace_tm_gpr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); pid = fork(); if (pid < 0) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c index 8706bea5d015..6f2bce1b6c5d 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c @@ -119,6 +119,7 @@ int ptrace_tm_spd_gpr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); pid = fork(); if (pid < 0) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c index 2ecfa1158e2b..e112a34fbe59 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c @@ -129,6 +129,7 @@ int ptrace_tm_spd_tar(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); pid = fork(); if (pid == 0) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c index 6f7fb51f0809..40133d49fe39 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c @@ -129,6 +129,7 @@ int ptrace_tm_spd_vsx(void) int ret, status, i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT); for (i = 0; i < 128; i++) { diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c index 068bfed2e606..880ba6a29a48 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c @@ -114,6 +114,7 @@ int ptrace_tm_spr(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(struct shared), 0777|IPC_CREAT); shm_id1 = shmget(IPC_PRIVATE, sizeof(int), 0777|IPC_CREAT); pid = fork(); diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c index 46ef378a15ec..d0db6df0f0ea 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c @@ -117,6 +117,7 @@ int ptrace_tm_tar(void) int ret, status; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); pid = fork(); if (pid == 0) diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c index 70ca01234f79..4f05ce4fd282 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c @@ -113,6 +113,7 @@ int ptrace_tm_vsx(void) int ret, status, i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT); for (i = 0; i < 128; i++) { diff --git a/tools/testing/selftests/powerpc/signal/signal_tm.c b/tools/testing/selftests/powerpc/signal/signal_tm.c index 5bf2224ef7f2..c9cf66a3daa2 100644 --- a/tools/testing/selftests/powerpc/signal/signal_tm.c +++ b/tools/testing/selftests/powerpc/signal/signal_tm.c @@ -56,6 +56,7 @@ static int test_signal_tm() } SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); for (i = 0; i < MAX_ATTEMPT; i++) { /* diff --git a/tools/testing/selftests/powerpc/tm/tm-exec.c b/tools/testing/selftests/powerpc/tm/tm-exec.c index 260cfdb97d23..c59919d6710d 100644 --- a/tools/testing/selftests/powerpc/tm/tm-exec.c +++ b/tools/testing/selftests/powerpc/tm/tm-exec.c @@ -27,6 +27,7 @@ static char *path; static int test_exec(void) { SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); asm __volatile__( "tbegin.;" diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c index 6efa5a685a77..c27b935f0e9f 100644 --- a/tools/testing/selftests/powerpc/tm/tm-fork.c +++ b/tools/testing/selftests/powerpc/tm/tm-fork.c @@ -21,6 +21,7 @@ int test_fork(void) { SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); asm __volatile__( "tbegin.;" diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c index 27c083a03d1f..a7bbf034b5bb 100644 --- a/tools/testing/selftests/powerpc/tm/tm-poison.c +++ b/tools/testing/selftests/powerpc/tm/tm-poison.c @@ -33,6 +33,7 @@ int tm_poison_test(void) bool fail_vr = false; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); cpu = pick_online_cpu(); FAIL_IF(cpu < 0); diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index 4cdb83964bb3..85c940ae6ff8 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -40,6 +40,7 @@ int test_body(void) uint64_t rv, dscr1 = 1, dscr2, texasr; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); printf("Check DSCR TM context switch: "); fflush(stdout); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c index 254f912ad611..657d755b2905 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c @@ -79,6 +79,7 @@ static int tm_signal_context_chk_fpu() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c index 0cc680f61828..400fa70ca71e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c @@ -81,6 +81,7 @@ static int tm_signal_context_chk_gpr() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c index b6d52730a0d8..d628fd302b28 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c @@ -104,6 +104,7 @@ static int tm_signal_context_chk() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c index 8e25e2072ecd..9bd869245bad 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c @@ -153,6 +153,7 @@ static int tm_signal_context_chk() pid_t pid = getpid(); SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); act.sa_sigaction = signal_usr1; sigemptyset(&act.sa_mask); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c index 5908bc6abe60..0b84c9208d62 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c @@ -226,6 +226,7 @@ int tm_signal_pagefault(void) stack_t ss; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!have_userfaultfd()); setup_uf_mem(); diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c index 07c388147b75..06b801906f27 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c @@ -32,6 +32,7 @@ int tm_signal_sigreturn_nt(void) struct sigaction trap_sa; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); trap_sa.sa_flags = SA_SIGINFO; trap_sa.sa_sigaction = trap_signal_handler; diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c index cdcf8c5bbbc7..68807aac8dd3 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c @@ -35,6 +35,7 @@ int tm_signal_stack() int pid; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); pid = fork(); if (pid < 0) diff --git a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c index 9a6017a1d769..ffe4e5515f33 100644 --- a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c +++ b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c @@ -55,6 +55,7 @@ int tm_sigreturn(void) uint64_t ret = 0; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); memset(&sa, 0, sizeof(sa)); diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c index becb8207b432..467a6b3134b2 100644 --- a/tools/testing/selftests/powerpc/tm/tm-syscall.c +++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c @@ -25,7 +25,6 @@ extern int getppid_tm_suspended(void); unsigned retries = 0; #define TEST_DURATION 10 /* seconds */ -#define TM_RETRIES 100 pid_t getppid_tm(bool suspend) { @@ -67,6 +66,7 @@ int tm_syscall(void) struct timeval end, now; SKIP_IF(!have_htm_nosc()); + SKIP_IF(htm_is_synthetic()); setbuf(stdout, NULL); diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c index 03be8c47292b..f2a9137f3c1e 100644 --- a/tools/testing/selftests/powerpc/tm/tm-tar.c +++ b/tools/testing/selftests/powerpc/tm/tm-tar.c @@ -26,6 +26,7 @@ int test_tar(void) int i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); for (i = 0; i < num_loops; i++) diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c index 794d574db784..dd5ddffa28b7 100644 --- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c +++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c @@ -96,6 +96,7 @@ int test_tmspr() unsigned long i; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); /* To cause some context switching */ thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN); diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c b/tools/testing/selftests/powerpc/tm/tm-trap.c index 11521077f915..97cb74768e30 100644 --- a/tools/testing/selftests/powerpc/tm/tm-trap.c +++ b/tools/testing/selftests/powerpc/tm/tm-trap.c @@ -255,6 +255,7 @@ int tm_trap_test(void) struct sigaction trap_sa; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); trap_sa.sa_flags = SA_SIGINFO; trap_sa.sa_sigaction = trap_signal_handler; diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c index a1348a5f721a..6bf1b65b020d 100644 --- a/tools/testing/selftests/powerpc/tm/tm-unavailable.c +++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c @@ -344,6 +344,7 @@ int tm_unavailable_test(void) cpu_set_t cpuset; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); cpu = pick_online_cpu(); FAIL_IF(cpu < 0); diff --git a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c index 9ef37a9836ac..34364ed2b6b7 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c @@ -91,6 +91,7 @@ int tm_vmx_unavail_test() pthread_t *thread; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); passed = 1; diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c index c1e788a6df47..1640e7ead69b 100644 --- a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c +++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c @@ -46,6 +46,7 @@ int test_vmxcopy() uint64_t aborted = 0; SKIP_IF(!have_htm()); + SKIP_IF(htm_is_synthetic()); SKIP_IF(!is_ppc64le()); fd = mkstemp(tmpfile); diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index c5a1e5c163fc..c03c6e778876 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -10,6 +10,9 @@ #include #include "utils.h" +#include "reg.h" + +#define TM_RETRIES 100 static inline bool have_htm(void) { @@ -31,6 +34,39 @@ static inline bool have_htm_nosc(void) #endif } +/* + * Transactional Memory was removed in ISA 3.1. A synthetic TM implementation + * is provided on P10 for threads running in P8/P9 compatibility mode. The + * synthetic implementation immediately fails after tbegin. This failure sets + * Bit 7 (Failure Persistent) and Bit 15 (Implementation-specific). + */ +static inline bool htm_is_synthetic(void) +{ + int i; + + /* + * Per the ISA, the Failure Persistent bit may be incorrect. Try a few + * times in case we got an Implementation-specific failure on a non ISA + * v3.1 system. On these systems the Implementation-specific failure + * should not be persistent. + */ + for (i = 0; i < TM_RETRIES; i++) { + asm volatile( + "tbegin.;" + "beq 1f;" + "tend.;" + "1:" + : + : + : "memory"); + + if ((__builtin_get_texasr() & (TEXASR_FP | TEXASR_IC)) != + (TEXASR_FP | TEXASR_IC)) + break; + } + return i == TM_RETRIES; +} + static inline long failure_code(void) { return __builtin_get_texasru() >> 24; From 4f8e78c0757e3c5a65d9d8ac76e2434c71a78f5a Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:36 +0800 Subject: [PATCH 258/474] powerpc: Add esr as a synonym for pt_regs.dsisr Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song [mpe: Reword commit title] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-2-sxwjean@me.com --- arch/powerpc/include/asm/ptrace.h | 5 ++++- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 333979cf5d1e..ab58939653db 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != + offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 51d4f5faf425..5463d201d465 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction"); From cfa47772ca8d53d7a6c9b331a7f6e7c4c9827214 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:37 +0800 Subject: [PATCH 259/474] powerpc/64e: Get esr offset with _ESR macro Use _ESR to get the offset of esr register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-3-sxwjean@me.com --- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/exceptions-64e.S | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a47eefa09bcb..f4ebc435fd78 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -287,6 +287,7 @@ int main(void) STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); @@ -298,7 +299,6 @@ int main(void) * we use them to hold SRR0 and SRR1. */ STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 7e0943d9f9b0..bf8822a5ae0e 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -546,7 +546,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) @@ -559,7 +559,7 @@ __end_interrupts: li r15,0 mr r14,r10 std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) @@ -576,7 +576,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) @@ -587,7 +587,7 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - std r14,_DSISR(r1) + std r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1058,7 +1058,7 @@ bad_stack_book3e: mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR std r10,_DAR(r1) - std r11,_DSISR(r1) + std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ From 4872cbd0ca35ca5b20d52e2539e7e1950f126e7b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:38 +0800 Subject: [PATCH 260/474] powerpc: Add dear as a synonym for pt_regs.dar register Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song [mpe: Reword commit title] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-4-sxwjean@me.com --- arch/powerpc/include/asm/ptrace.h | 5 ++++- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index ab58939653db..5114ddba6f16 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != + offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) != From d9db6e420268b2d561731468a31f0b15e2e9a145 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:39 +0800 Subject: [PATCH 261/474] powerpc/64e: Get dear offset with _DEAR macro Use _DEAR to get the offset of dear register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-5-sxwjean@me.com --- arch/powerpc/kernel/asm-offsets.c | 13 +++---------- arch/powerpc/kernel/exceptions-64e.S | 8 ++++---- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f4ebc435fd78..8357d5fcd09e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -286,23 +286,16 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* - * The PowerPC 400-class & Book-E processors have neither the DAR - * nor the DSISR SPRs. Hence, we overload them to hold the similar - * DEAR and ESR SPRs for such processors. For critical interrupts - * we use them to hold SRR0 and SRR1. - */ - STACK_PT_REGS_OFFSET(_DEAR, dar); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif #ifdef CONFIG_PPC_PKEY STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index bf8822a5ae0e..711c66b76df1 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -545,7 +545,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -558,7 +558,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -575,7 +575,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -1057,7 +1057,7 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) + std r10,_DEAR(r1) std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ From 133c17a1788d68c9fff59d5f724a4ba14647a16d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 08:24:20 +0000 Subject: [PATCH 262/474] powerpc: Remove MSR_PR check in interrupt_exit_{user/kernel}_prepare() In those hot functions that are called at every interrupt, any saved cycle is worth it. interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare() are called from three places: - From entry_32.S - From interrupt_64.S - From interrupt_exit_user_restart() and interrupt_exit_kernel_restart() In entry_32.S, there are inambiguously called based on MSR_PR: interrupt_return: lwz r4,_MSR(r1) addi r3,r1,STACK_FRAME_OVERHEAD andi. r0,r4,MSR_PR beq .Lkernel_interrupt_return bl interrupt_exit_user_prepare ... .Lkernel_interrupt_return: bl interrupt_exit_kernel_prepare In interrupt_64.S, that's similar: interrupt_return_\srr\(): ld r4,_MSR(r1) andi. r0,r4,MSR_PR beq interrupt_return_\srr\()_kernel interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */ addi r3,r1,STACK_FRAME_OVERHEAD bl interrupt_exit_user_prepare ... interrupt_return_\srr\()_kernel: addi r3,r1,STACK_FRAME_OVERHEAD bl interrupt_exit_kernel_prepare In interrupt_exit_user_restart() and interrupt_exit_kernel_restart(), MSR_PR is verified respectively by BUG_ON(!user_mode(regs)) and BUG_ON(user_mode(regs)) prior to calling interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare(). The verification in interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare() are therefore useless and can be removed. Signed-off-by: Christophe Leroy Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/385ead49ccb66a259b25fee3eebf0bd4094068f3.1629707037.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/interrupt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index abc2b3dc3f20..531f0a1dbabc 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -464,7 +464,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) BUG_ON(!(regs->msr & MSR_RI)); - BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -498,7 +497,6 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && unlikely(!(regs->msr & MSR_RI))) unrecoverable_exception(regs); - BUG_ON(regs->msr & MSR_PR); /* * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. From 806c0e6e7e97adc17389c8dc1f52d4736f49299b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 08:24:21 +0000 Subject: [PATCH 263/474] powerpc: Refactor verification of MSR_RI 40x and BOOKE don't have MSR_RI therefore all tests involving MSR_RI may be problematic on those plateforms. Create helpers to check or set MSR_RI in regs, and use them in common code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c2fb93708196734f4176dda334aaa3055f213b89.1629707037.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 23 +++++++++++++++++++ arch/powerpc/kernel/interrupt.c | 9 +++----- arch/powerpc/kernel/traps.c | 8 +++---- arch/powerpc/mm/book3s64/slb.c | 2 +- arch/powerpc/platforms/embedded6xx/holly.c | 2 +- .../platforms/embedded6xx/mpc7448_hpc2.c | 2 +- arch/powerpc/platforms/pasemi/idle.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/sysdev/fsl_rio.c | 2 +- arch/powerpc/xmon/xmon.c | 16 +++---------- 11 files changed, 40 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 5114ddba6f16..f9c235c1d6ce 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef __ASSEMBLY__ struct pt_regs @@ -272,6 +273,28 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) regs->gpr[3] = rc; } +static inline bool cpu_has_msr_ri(void) +{ + return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x); +} + +static inline bool regs_is_unrecoverable(struct pt_regs *regs) +{ + return unlikely(cpu_has_msr_ri() && !(regs->msr & MSR_RI)); +} + +static inline void regs_set_recoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr | MSR_RI); +} + +static inline void regs_set_unrecoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr & ~MSR_RI); +} + #define arch_has_single_step() (1) #define arch_has_block_step() (true) #define ARCH_HAS_USER_SINGLE_STEP_REPORT diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 531f0a1dbabc..a73f3f70a657 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -92,8 +92,7 @@ notrace long system_call_exception(long r3, long r4, long r5, CT_WARN_ON(ct_state() == CONTEXT_KERNEL); user_exit_irqoff(); - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); @@ -462,8 +461,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) { unsigned long ret; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -494,8 +492,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) bool stack_store = current_thread_info()->flags & _TIF_EMULATE_STACK_STORE; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && - unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* * CT_WARN_ON comes here via program_check_exception, diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5463d201d465..3a344f5265c2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -428,7 +428,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) return; nonrecoverable: - regs_set_return_msr(regs, regs->msr & ~MSR_RI); + regs_set_unrecoverable(regs); #endif } DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) @@ -498,7 +498,7 @@ out: die("Unrecoverable nested System Reset", regs, SIGABRT); #endif /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* For the reason explained in die_mce, nmi_exit before die */ nmi_exit(); die("Unrecoverable System Reset", regs, SIGABRT); @@ -550,7 +550,7 @@ static inline int check_io_access(struct pt_regs *regs) printk(KERN_DEBUG "%s bad port %lx at %p\n", (*nip & 0x100)? "OUT to": "IN from", regs->gpr[rb] - _IO_BASE, nip); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } @@ -840,7 +840,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) bail: /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) + if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c91bd85eb90e..f0037bcc47a0 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -822,7 +822,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 85521b3e7098..7a85b117f7a4 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -251,7 +251,7 @@ static int ppc750_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index d8da6a483e59..9eb9abb5bce2 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -173,7 +173,7 @@ static int mpc7448_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 9b88e3cded7d..2a5b84268b77 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -58,7 +58,7 @@ static int pasemi_system_reset_exception(struct pt_regs *regs) restore_astate(hard_smp_processor_id()); /* everything handled */ - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); return 1; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2835376e61a4..e9d18519e650 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -588,7 +588,7 @@ static int opal_recover_mce(struct pt_regs *regs, { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..56092dccfdb8 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -783,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 5a95b8ea23d8..ff7906b48ca1 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -108,7 +108,7 @@ int fsl_rio_mcheck_exception(struct pt_regs *regs) __func__); out_be32((u32 *)(rio_regs_win + RIO_LTLEDCSR), 0); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index ead460b80905..dd8241c009e5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -482,16 +482,6 @@ static inline void get_output_lock(void) {} static inline void release_output_lock(void) {} #endif -static inline int unrecoverable_excp(struct pt_regs *regs) -{ -#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E) - /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */ - return 0; -#else - return ((regs->msr & MSR_RI) == 0); -#endif -} - static void xmon_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -565,7 +555,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) bp = NULL; if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT)) bp = at_breakpoint(regs->nip); - if (bp || unrecoverable_excp(regs)) + if (bp || regs_is_unrecoverable(regs)) fromipi = 0; if (!fromipi) { @@ -577,7 +567,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) cpu, BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); release_output_lock(); @@ -693,7 +683,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) printf("Stopped at breakpoint %tx (", BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); remove_bpts(); From c12adb0678446b3284a6135dc5fa7aa3b6a968a5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2021 08:40:38 -0500 Subject: [PATCH 264/474] powerpc: retire sbc8548 board support The support was for this was mainlined 13 years ago, in v2.6.25 [0e0fffe88767] just around the ppc --> powerpc migration. I believe the board was introduced a year or two before that, so it is roughly a 15 year old platform - with the CPU speed and memory size that was typical for that era. I haven't had one of these boards for several years, and availability was discontinued several years before that. Given that, there is no point in adding a burden to testing coverage that builds all possible defconfigs, so it makes sense to remove it. Of course it will remain in the git history forever, for anyone who happens to find a functional board and wants to tinker with it. Acked-by: Scott Wood Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman --- arch/powerpc/boot/Makefile | 1 - arch/powerpc/boot/dts/sbc8548-altflash.dts | 111 -------- arch/powerpc/boot/dts/sbc8548-post.dtsi | 289 -------------------- arch/powerpc/boot/dts/sbc8548-pre.dtsi | 48 ---- arch/powerpc/boot/dts/sbc8548.dts | 106 ------- arch/powerpc/boot/wrapper | 2 +- arch/powerpc/configs/85xx/sbc8548_defconfig | 50 ---- arch/powerpc/configs/mpc85xx_base.config | 1 - arch/powerpc/platforms/85xx/Kconfig | 6 - arch/powerpc/platforms/85xx/Makefile | 1 - arch/powerpc/platforms/85xx/sbc8548.c | 134 --------- 11 files changed, 1 insertion(+), 748 deletions(-) delete mode 100644 arch/powerpc/boot/dts/sbc8548-altflash.dts delete mode 100644 arch/powerpc/boot/dts/sbc8548-post.dtsi delete mode 100644 arch/powerpc/boot/dts/sbc8548-pre.dtsi delete mode 100644 arch/powerpc/boot/dts/sbc8548.dts delete mode 100644 arch/powerpc/configs/85xx/sbc8548_defconfig delete mode 100644 arch/powerpc/platforms/85xx/sbc8548.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e312ea802aa6..3e30ea6f2187 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -341,7 +341,6 @@ image-$(CONFIG_TQM8541) += cuImage.tqm8541 image-$(CONFIG_TQM8548) += cuImage.tqm8548 image-$(CONFIG_TQM8555) += cuImage.tqm8555 image-$(CONFIG_TQM8560) += cuImage.tqm8560 -image-$(CONFIG_SBC8548) += cuImage.sbc8548 image-$(CONFIG_KSI8560) += cuImage.ksi8560 # Board ports in arch/powerpc/platform/86xx/Kconfig diff --git a/arch/powerpc/boot/dts/sbc8548-altflash.dts b/arch/powerpc/boot/dts/sbc8548-altflash.dts deleted file mode 100644 index bb7a1e712bb7..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-altflash.dts +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Configured for booting off the alternate (64MB SODIMM) flash. - * Requires switching JP12 jumpers and changing SW2.8 setting. - * - * Copyright 2013 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xfc000000 0x04000000 /*64MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xef800000 0x00800000>; /*8MB Flash*/ - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x0 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* FC000000 -> FFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* FFF00000 -> FFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x6 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* EF800000 -> EFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* EFFA0000 -> EFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/dts/sbc8548-post.dtsi b/arch/powerpc/boot/dts/sbc8548-post.dtsi deleted file mode 100644 index 9d848d409408..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-post.dtsi +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - soc8548@e0000000 { - #address-cells = <1>; - #size-cells = <1>; - device_type = "soc"; - ranges = <0x00000000 0xe0000000 0x00100000>; - bus-frequency = <0>; - compatible = "simple-bus"; - - ecm-law@0 { - compatible = "fsl,ecm-law"; - reg = <0x0 0x1000>; - fsl,num-laws = <10>; - }; - - ecm@1000 { - compatible = "fsl,mpc8548-ecm", "fsl,ecm"; - reg = <0x1000 0x1000>; - interrupts = <17 2>; - interrupt-parent = <&mpic>; - }; - - memory-controller@2000 { - compatible = "fsl,mpc8548-memory-controller"; - reg = <0x2000 0x1000>; - interrupt-parent = <&mpic>; - interrupts = <0x12 0x2>; - }; - - L2: l2-cache-controller@20000 { - compatible = "fsl,mpc8548-l2-cache-controller"; - reg = <0x20000 0x1000>; - cache-line-size = <0x20>; // 32 bytes - cache-size = <0x80000>; // L2, 512K - interrupt-parent = <&mpic>; - interrupts = <0x10 0x2>; - }; - - i2c@3000 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <0>; - compatible = "fsl-i2c"; - reg = <0x3000 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - i2c@3100 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <1>; - compatible = "fsl-i2c"; - reg = <0x3100 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - dma@21300 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "fsl,mpc8548-dma", "fsl,eloplus-dma"; - reg = <0x21300 0x4>; - ranges = <0x0 0x21100 0x200>; - cell-index = <0>; - dma-channel@0 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x0 0x80>; - cell-index = <0>; - interrupt-parent = <&mpic>; - interrupts = <20 2>; - }; - dma-channel@80 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x80 0x80>; - cell-index = <1>; - interrupt-parent = <&mpic>; - interrupts = <21 2>; - }; - dma-channel@100 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x100 0x80>; - cell-index = <2>; - interrupt-parent = <&mpic>; - interrupts = <22 2>; - }; - dma-channel@180 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x180 0x80>; - cell-index = <3>; - interrupt-parent = <&mpic>; - interrupts = <23 2>; - }; - }; - - enet0: ethernet@24000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <0>; - device_type = "network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x24000 0x1000>; - ranges = <0x0 0x24000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-mdio"; - reg = <0x520 0x20>; - - phy0: ethernet-phy@19 { - interrupt-parent = <&mpic>; - interrupts = <0x6 0x1>; - reg = <0x19>; - }; - phy1: ethernet-phy@1a { - interrupt-parent = <&mpic>; - interrupts = <0x7 0x1>; - reg = <0x1a>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - enet1: ethernet@25000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <1>; - device_type = "network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x25000 0x1000>; - ranges = <0x0 0x25000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-tbi"; - reg = <0x520 0x20>; - - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - serial0: serial@4500 { - cell-index = <0>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4500 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - serial1: serial@4600 { - cell-index = <1>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4600 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - global-utilities@e0000 { //global utilities reg - compatible = "fsl,mpc8548-guts"; - reg = <0xe0000 0x1000>; - fsl,has-rstcr; - }; - - crypto@30000 { - compatible = "fsl,sec2.1", "fsl,sec2.0"; - reg = <0x30000 0x10000>; - interrupts = <45 2>; - interrupt-parent = <&mpic>; - fsl,num-channels = <4>; - fsl,channel-fifo-len = <24>; - fsl,exec-units-mask = <0xfe>; - fsl,descriptor-types-mask = <0x12b0ebf>; - }; - - mpic: pic@40000 { - interrupt-controller; - #address-cells = <0>; - #interrupt-cells = <2>; - reg = <0x40000 0x40000>; - compatible = "chrp,open-pic"; - device_type = "open-pic"; - }; - }; - - pci0: pci@e0008000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - /* IDSEL 0x01 (PCI-X slot) @66MHz */ - 0x0800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x0800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x0800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x0800 0x0 0x0 0x4 &mpic 0x1 0x1 - - /* IDSEL 0x11 (PCI, 3.3V 32bit) @33MHz */ - 0x8800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x8800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x8800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x8800 0x0 0x0 0x4 &mpic 0x1 0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x18 0x2>; - bus-range = <0 0>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00800000>; - clock-frequency = <66000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe0008000 0x1000>; - compatible = "fsl,mpc8540-pcix", "fsl,mpc8540-pci"; - device_type = "pci"; - }; - - pci1: pcie@e000a000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - - /* IDSEL 0x0 (PEX) */ - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x1a 0x2>; - bus-range = <0x0 0xff>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2800000 0x0 0x08000000>; - clock-frequency = <33000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe000a000 0x1000>; - compatible = "fsl,mpc8548-pcie"; - device_type = "pci"; - pcie@0 { - reg = <0x0 0x0 0x0 0x0 0x0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x10000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00800000>; - }; - }; -}; diff --git a/arch/powerpc/boot/dts/sbc8548-pre.dtsi b/arch/powerpc/boot/dts/sbc8548-pre.dtsi deleted file mode 100644 index 0e3665fd15d0..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-pre.dtsi +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - model = "SBC8548"; - compatible = "SBC8548"; - #address-cells = <1>; - #size-cells = <1>; - - aliases { - ethernet0 = &enet0; - ethernet1 = &enet1; - serial0 = &serial0; - serial1 = &serial1; - pci0 = &pci0; - pci1 = &pci1; - }; - - cpus { - #address-cells = <1>; - #size-cells = <0>; - - PowerPC,8548@0 { - device_type = "cpu"; - reg = <0>; - d-cache-line-size = <0x20>; // 32 bytes - i-cache-line-size = <0x20>; // 32 bytes - d-cache-size = <0x8000>; // L1, 32K - i-cache-size = <0x8000>; // L1, 32K - timebase-frequency = <0>; // From uboot - bus-frequency = <0>; - clock-frequency = <0>; - next-level-cache = <&L2>; - }; - }; - - memory { - device_type = "memory"; - reg = <0x00000000 0x10000000>; - }; - -}; diff --git a/arch/powerpc/boot/dts/sbc8548.dts b/arch/powerpc/boot/dts/sbc8548.dts deleted file mode 100644 index ce0a119f496e..000000000000 --- a/arch/powerpc/boot/dts/sbc8548.dts +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xff800000 0x00800000 /*8MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xec000000 0x04000000>; /*64MB Flash*/ - - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x0 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* FF800000 -> FFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* FFFA0000 -> FFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x6 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* EC000000 -> EFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* EFF00000 -> EFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 1f4676bab786..1cd82564c996 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -298,7 +298,7 @@ cuboot*) *-tqm8541|*-mpc8560*|*-tqm8560|*-tqm8555|*-ksi8560*) platformo=$object/cuboot-85xx-cpm2.o ;; - *-mpc85*|*-tqm85*|*-sbc85*) + *-mpc85*|*-tqm85*) platformo=$object/cuboot-85xx.o ;; *-amigaone) diff --git a/arch/powerpc/configs/85xx/sbc8548_defconfig b/arch/powerpc/configs/85xx/sbc8548_defconfig deleted file mode 100644 index 258881727119..000000000000 --- a/arch/powerpc/configs/85xx/sbc8548_defconfig +++ /dev/null @@ -1,50 +0,0 @@ -CONFIG_PPC_85xx=y -CONFIG_SYSVIPC=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_SLAB=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_SBC8548=y -CONFIG_GEN_RTC=y -CONFIG_BINFMT_MISC=y -CONFIG_MATH_EMULATION=y -# CONFIG_SECCOMP is not set -CONFIG_PCI=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_XFRM_USER=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set -# CONFIG_FW_LOADER is not set -CONFIG_MTD=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_CFI=y -CONFIG_MTD_CFI_ADV_OPTIONS=y -CONFIG_MTD_CFI_GEOMETRY=y -CONFIG_MTD_CFI_I4=y -CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_NETDEVICES=y -CONFIG_GIANFAR=y -CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_HW_RANDOM is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y diff --git a/arch/powerpc/configs/mpc85xx_base.config b/arch/powerpc/configs/mpc85xx_base.config index b1593fe6f70b..85907b776908 100644 --- a/arch/powerpc/configs/mpc85xx_base.config +++ b/arch/powerpc/configs/mpc85xx_base.config @@ -13,7 +13,6 @@ CONFIG_P1022_DS=y CONFIG_P1022_RDK=y CONFIG_P1023_RDB=y CONFIG_TWR_P102x=y -CONFIG_SBC8548=y CONFIG_SOCRATES=y CONFIG_STX_GP3=y CONFIG_TQM8540=y diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index b77cbb0a50e1..4142ebf01382 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -208,12 +208,6 @@ config TQM8560 select TQM85xx select CPM2 -config SBC8548 - bool "Wind River SBC8548" - select DEFAULT_UIMAGE - help - This option enables support for the Wind River SBC8548 board - config PPA8548 bool "Prodrive PPA8548" help diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile index d1dd0dca5ebf..60e4e97a929d 100644 --- a/arch/powerpc/platforms/85xx/Makefile +++ b/arch/powerpc/platforms/85xx/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_CORENET_GENERIC) += corenet_generic.o obj-$(CONFIG_FB_FSL_DIU) += t1042rdb_diu.o obj-$(CONFIG_STX_GP3) += stx_gp3.o obj-$(CONFIG_TQM85xx) += tqm85xx.o -obj-$(CONFIG_SBC8548) += sbc8548.o obj-$(CONFIG_PPA8548) += ppa8548.o obj-$(CONFIG_SOCRATES) += socrates.o socrates_fpga_pic.o obj-$(CONFIG_KSI8560) += ksi8560.o diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c deleted file mode 100644 index e4acf5ce6b07..000000000000 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Wind River SBC8548 setup and early boot code. - * - * Copyright 2007 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the MPC8548CDS support - Copyright 2005 Freescale Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "mpc85xx.h" - -static int sbc_rev; - -static void __init sbc8548_pic_init(void) -{ - struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN, - 0, 256, " OpenPIC "); - BUG_ON(mpic == NULL); - mpic_init(mpic); -} - -/* Extract the HW Rev from the EPLD on the board */ -static int __init sbc8548_hw_rev(void) -{ - struct device_node *np; - struct resource res; - unsigned int *rev; - int board_rev = 0; - - np = of_find_compatible_node(NULL, NULL, "hw-rev"); - if (np == NULL) { - printk("No HW-REV found in DTB.\n"); - return -ENODEV; - } - - of_address_to_resource(np, 0, &res); - of_node_put(np); - - rev = ioremap(res.start,sizeof(unsigned int)); - board_rev = (*rev) >> 28; - iounmap(rev); - - return board_rev; -} - -/* - * Setup the architecture - */ -static void __init sbc8548_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8548_setup_arch()", 0); - - fsl_pci_assign_primary(); - - sbc_rev = sbc8548_hw_rev(); -} - -static void sbc8548_show_cpuinfo(struct seq_file *m) -{ - uint pvid, svid, phid1; - - pvid = mfspr(SPRN_PVR); - svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River\n"); - seq_printf(m, "Machine\t\t: SBC8548 v%d\n", sbc_rev); - seq_printf(m, "PVR\t\t: 0x%x\n", pvid); - seq_printf(m, "SVR\t\t: 0x%x\n", svid); - - /* Display cpu Pll setting */ - phid1 = mfspr(SPRN_HID1); - seq_printf(m, "PLL setting\t: 0x%x\n", ((phid1 >> 24) & 0x3f)); -} - -machine_arch_initcall(sbc8548, mpc85xx_common_publish_devices); - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8548_probe(void) -{ - return of_machine_is_compatible("SBC8548"); -} - -define_machine(sbc8548) { - .name = "SBC8548", - .probe = sbc8548_probe, - .setup_arch = sbc8548_setup_arch, - .init_IRQ = sbc8548_pic_init, - .show_cpuinfo = sbc8548_show_cpuinfo, - .get_irq = mpic_get_irq, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, - .pcibios_fixup_phb = fsl_pcibios_fixup_phb, -#endif - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -}; From d7c1814f2f4f812d7a39447f975abdc095dbf7aa Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2021 13:45:32 -0500 Subject: [PATCH 265/474] powerpc: retire sbc8641d board support The support was for this was added to mainline over 12 years ago, in v2.6.26 [4e8aae89a35d] just around the ppc --> powerpc migration. I believe the board was introduced shortly after the sbc8548 board, making it roughly a 14 year old platform - with the CPU speed and memory size typical for that era. I haven't had one of these boards for several years, and availability was discontinued several years before that. Given that, there is no point in adding a burden to testing coverage that builds all possible defconfigs, so it makes sense to remove it. Of course it will remain in the git history forever, for anyone who happens to find a functional board and wants to tinker with it. Acked-by: Scott Wood Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsl/sbc8641d.dts | 176 ----------------------- arch/powerpc/configs/mpc86xx_base.config | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/platforms/86xx/Kconfig | 8 +- arch/powerpc/platforms/86xx/Makefile | 1 - arch/powerpc/platforms/86xx/sbc8641d.c | 87 ----------- 6 files changed, 1 insertion(+), 273 deletions(-) delete mode 100644 arch/powerpc/boot/dts/fsl/sbc8641d.dts delete mode 100644 arch/powerpc/platforms/86xx/sbc8641d.c diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts deleted file mode 100644 index 3dca10acc161..000000000000 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D Device Tree Source - * - * Copyright 2008 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the mpc8641_hpcn.dts by Freescale Semiconductor Inc. - */ - -/include/ "mpc8641si-pre.dtsi" - -/ { - model = "SBC8641D"; - compatible = "wind,sbc8641"; - - memory { - device_type = "memory"; - reg = <0x00000000 0x20000000>; // 512M at 0x0 - }; - - lbc: localbus@f8005000 { - reg = <0xf8005000 0x1000>; - - ranges = <0 0 0xff000000 0x01000000 // 16MB Boot flash - 1 0 0xf0000000 0x00010000 // 64KB EEPROM - 2 0 0xf1000000 0x00100000 // EPLD (1MB) - 3 0 0xe0000000 0x04000000 // 64MB LB SDRAM (CS3) - 4 0 0xe4000000 0x04000000 // 64MB LB SDRAM (CS4) - 6 0 0xf4000000 0x00100000 // LCD display (1MB) - 7 0 0xe8000000 0x04000000>; // 64MB OneNAND - - flash@0,0 { - compatible = "cfi-flash"; - reg = <0 0 0x01000000>; - bank-width = <2>; - device-width = <2>; - #address-cells = <1>; - #size-cells = <1>; - partition@0 { - label = "dtb"; - reg = <0x00000000 0x00100000>; - read-only; - }; - partition@300000 { - label = "kernel"; - reg = <0x00100000 0x00400000>; - read-only; - }; - partition@400000 { - label = "fs"; - reg = <0x00500000 0x00a00000>; - }; - partition@700000 { - label = "firmware"; - reg = <0x00f00000 0x00100000>; - read-only; - }; - }; - - epld@2,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <2 0 0x100000>; - ranges = <0 0 5 0 1 // User switches - 1 0 5 1 1 // Board ID/Rev - 3 0 5 3 1>; // LEDs - }; - }; - - soc: soc@f8000000 { - ranges = <0x00000000 0xf8000000 0x00100000>; - - enet0: ethernet@24000 { - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - phy-connection-type = "rgmii-id"; - }; - - mdio@24520 { - phy0: ethernet-phy@1f { - reg = <0x1f>; - }; - phy1: ethernet-phy@0 { - reg = <0>; - }; - phy2: ethernet-phy@1 { - reg = <1>; - }; - phy3: ethernet-phy@2 { - reg = <2>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet1: ethernet@25000 { - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - phy-connection-type = "rgmii-id"; - }; - - mdio@25520 { - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet2: ethernet@26000 { - tbi-handle = <&tbi2>; - phy-handle = <&phy2>; - phy-connection-type = "rgmii-id"; - }; - - mdio@26520 { - tbi2: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet3: ethernet@27000 { - tbi-handle = <&tbi3>; - phy-handle = <&phy3>; - phy-connection-type = "rgmii-id"; - }; - - mdio@27520 { - tbi3: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - pci0: pcie@f8008000 { - reg = <0xf8008000 0x1000>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00100000>; - interrupt-map-mask = <0xff00 0 0 7>; - - pcie@0 { - ranges = <0x02000000 0x0 0x80000000 - 0x02000000 0x0 0x80000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - - }; - - pci1: pcie@f8009000 { - reg = <0xf8009000 0x1000>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - - pcie@0 { - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - }; -}; - -/include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/configs/mpc86xx_base.config b/arch/powerpc/configs/mpc86xx_base.config index 67bd1fa036ee..588870e6af3b 100644 --- a/arch/powerpc/configs/mpc86xx_base.config +++ b/arch/powerpc/configs/mpc86xx_base.config @@ -1,6 +1,5 @@ CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_PPC9A=y CONFIG_GEF_SBC310=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ee09f7bb6ea9..6697c5e6682f 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -55,7 +55,6 @@ CONFIG_MPC837x_RDB=y CONFIG_ASP834x=y CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_SBC610=y CONFIG_CPU_FREQ=y diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig index 07a9d60c618a..be867abebc83 100644 --- a/arch/powerpc/platforms/86xx/Kconfig +++ b/arch/powerpc/platforms/86xx/Kconfig @@ -20,12 +20,6 @@ config MPC8641_HPCN help This option enables support for the MPC8641 HPCN board. -config SBC8641D - bool "Wind River SBC8641D" - select DEFAULT_UIMAGE - help - This option enables support for the WRS SBC8641D board. - config MPC8610_HPCD bool "Freescale MPC8610 HPCD" select DEFAULT_UIMAGE @@ -74,7 +68,7 @@ config MPC8641 select FSL_PCI if PCI select PPC_UDBG_16550 select MPIC - default y if MPC8641_HPCN || SBC8641D || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ + default y if MPC8641_HPCN || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ || MVME7100 config MPC8610 diff --git a/arch/powerpc/platforms/86xx/Makefile b/arch/powerpc/platforms/86xx/Makefile index 2c04449be107..5bbe1475bf26 100644 --- a/arch/powerpc/platforms/86xx/Makefile +++ b/arch/powerpc/platforms/86xx/Makefile @@ -6,7 +6,6 @@ obj-y := pic.o common.o obj-$(CONFIG_SMP) += mpc86xx_smp.o obj-$(CONFIG_MPC8641_HPCN) += mpc86xx_hpcn.o -obj-$(CONFIG_SBC8641D) += sbc8641d.o obj-$(CONFIG_MPC8610_HPCD) += mpc8610_hpcd.o obj-$(CONFIG_GEF_SBC610) += gef_sbc610.o obj-$(CONFIG_GEF_SBC310) += gef_sbc310.o diff --git a/arch/powerpc/platforms/86xx/sbc8641d.c b/arch/powerpc/platforms/86xx/sbc8641d.c deleted file mode 100644 index dc23dd383d6e..000000000000 --- a/arch/powerpc/platforms/86xx/sbc8641d.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D board specific routines - * - * Copyright 2008 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the 8641 HPCN support by Freescale Semiconductor Inc. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "mpc86xx.h" - -static void __init -sbc8641_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8641_setup_arch()", 0); - - printk("SBC8641 board from Wind River\n"); - -#ifdef CONFIG_SMP - mpc86xx_smp_init(); -#endif - - fsl_pci_assign_primary(); -} - - -static void -sbc8641_show_cpuinfo(struct seq_file *m) -{ - uint svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River Systems\n"); - - seq_printf(m, "SVR\t\t: 0x%x\n", svid); -} - - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8641_probe(void) -{ - if (of_machine_is_compatible("wind,sbc8641")) - return 1; /* Looks good */ - - return 0; -} - -machine_arch_initcall(sbc8641, mpc86xx_common_publish_devices); - -define_machine(sbc8641) { - .name = "SBC8641D", - .probe = sbc8641_probe, - .setup_arch = sbc8641_setup_arch, - .init_IRQ = mpc86xx_init_irq, - .show_cpuinfo = sbc8641_show_cpuinfo, - .get_irq = mpic_get_irq, - .time_init = mpc86xx_time_init, - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, -#endif -}; From 5bd4ae07e7970d73e3372910e60bb38623ac3064 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2021 14:18:59 -0500 Subject: [PATCH 266/474] MAINTAINERS: update for Paul Gortmaker Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6c8be735cc91..43f07a1a3a52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6801,7 +6801,6 @@ F: Documentation/admin-guide/media/em28xx* F: drivers/media/usb/em28xx/ EMBEDDED LINUX -M: Paul Gortmaker M: Matt Mackall M: David Woodhouse L: linux-embedded@vger.kernel.org From f50da6edbf1ebf35dd8070847bfab5cb988d472b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 25 Aug 2021 09:54:47 +0530 Subject: [PATCH 267/474] powerpc/doc: Fix htmldocs errors Fix make htmldocs related errors with the newly added associativity.rst doc file. Reported-by: Stephen Rothwell Tested-by: Stephen Rothwell # build test Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210825042447.106219-1-aneesh.kumar@linux.ibm.com --- Documentation/powerpc/associativity.rst | 29 +++++++++++++------------ Documentation/powerpc/index.rst | 1 + 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst index 07e7dd3d6c87..4d01c7368561 100644 --- a/Documentation/powerpc/associativity.rst +++ b/Documentation/powerpc/associativity.rst @@ -1,6 +1,6 @@ ============================ NUMA resource associativity -============================= +============================ Associativity represents the groupings of the various platform resources into domains of substantially similar mean performance relative to resources outside @@ -20,11 +20,11 @@ A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativi bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used. Form 0 ------ +------ Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE). Form 1 ------ +------ With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity device tree properties are used to determine the NUMA distance between resource groups/domains. @@ -78,17 +78,18 @@ numa-lookup-index-table. For ex: ibm,numa-lookup-index-table = <3 0 8 40>; -ibm,numa-distace-table = <9>, /bits/ 8 < 10 20 80 - 20 10 160 - 80 160 10>; - | 0 8 40 ---|------------ - | -0 | 10 20 80 - | -8 | 20 10 160 - | -40| 80 160 10 +ibm,numa-distace-table = <9>, /bits/ 8 < 10 20 80 20 10 160 80 160 10>; + +:: + + | 0 8 40 + --|------------ + | + 0 | 10 20 80 + | + 8 | 20 10 160 + | + 40| 80 160 10 A possible "ibm,associativity" property for resources in node 0, 8 and 40 diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index bf5f1a2bdbdf..0f7d3c495693 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -7,6 +7,7 @@ powerpc .. toctree:: :maxdepth: 1 + associativity booting bootwrapper cpu_families From 8149238ffd210875f5a77e3c654bb59b58da35e3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 25 Aug 2021 13:34:45 +0000 Subject: [PATCH 268/474] powerpc: Redefine HMT_xxx macros as empty on PPC32 HMT_xxx macros are macros for adjusting thread priority (hardware multi-threading) are macros inherited from PPC64 via commit 5f7c690728ac ("[PATCH] powerpc: Merged ppc_asm.h") Those instructions are pointless on PPC32, but some common fonctions like arch_cpu_idle() use them. So make them empty on PPC32 to avoid those instructions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c5a07fadea33d640ad10cecf0ac8faaec1c524e0.1629898474.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/processor.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index e072577bc7c0..8d79f994b4aa 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -5,12 +5,21 @@ #ifndef __ASSEMBLY__ /* Macros for adjusting thread priority (hardware multi-threading) */ +#ifdef CONFIG_PPC64 #define HMT_very_low() asm volatile("or 31, 31, 31 # very low priority") #define HMT_low() asm volatile("or 1, 1, 1 # low priority") #define HMT_medium_low() asm volatile("or 6, 6, 6 # medium low priority") #define HMT_medium() asm volatile("or 2, 2, 2 # medium priority") #define HMT_medium_high() asm volatile("or 5, 5, 5 # medium high priority") #define HMT_high() asm volatile("or 3, 3, 3 # high priority") +#else +#define HMT_very_low() +#define HMT_low() +#define HMT_medium_low() +#define HMT_medium() +#define HMT_medium_high() +#define HMT_high() +#endif #ifdef CONFIG_PPC64 #define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) From 602d0f96563c2e0b8e1ddb22ac46bf7f58480d64 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:51 +0930 Subject: [PATCH 269/474] powerpc/microwatt: Add Ethernet to device tree The liteeth network device is used in the Microwatt soc. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-2-joel@jms.id.au --- arch/powerpc/boot/dts/microwatt.dts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index 974abbdda249..65b270a90f94 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -127,6 +127,18 @@ fifo-size = <16>; interrupts = <0x10 0x1>; }; + + ethernet@8020000 { + compatible = "litex,liteeth"; + reg = <0x8021000 0x100 + 0x8020800 0x100 + 0x8030000 0x2000>; + reg-names = "mac", "mido", "buffer"; + litex,rx-slots = <2>; + litex,tx-slots = <2>; + litex,slot-size = <0x800>; + interrupts = <0x11 0x1>; + }; }; chosen { From ef4fcaf99cd27eb48790f2adc4eff456dbe1dec4 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:52 +0930 Subject: [PATCH 270/474] powerpc/configs/microwattt: Enable Liteeth Liteeth is the network device used by Microwatt. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-3-joel@jms.id.au --- arch/powerpc/configs/microwatt_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index ebc90aefbc0c..6b76232094b2 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -53,6 +53,7 @@ CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y +CONFIG_LITEX_LITEETH=y # CONFIG_WLAN is not set # CONFIG_INPUT is not set # CONFIG_SERIO is not set From 3e18e271182206c996a3a7efbbe70c66307ef137 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:53 +0930 Subject: [PATCH 271/474] powerpc/configs/microwatt: Enable options for systemd When booting with systemd these options are required. This increases the image by about 50KB, or 2%. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-4-joel@jms.id.au --- arch/powerpc/configs/microwatt_defconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index 6b76232094b2..9465209b8c5b 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -5,6 +5,7 @@ CONFIG_PREEMPT_VOLUNTARY=y CONFIG_TICK_CPU_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_CGROUPS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KALLSYMS_ALL=y @@ -78,8 +79,10 @@ CONFIG_SPI_SPIDEV=y CONFIG_EXT4_FS=y # CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set -# CONFIG_INOTIFY_USER is not set +CONFIG_AUTOFS_FS=y +CONFIG_TMPFS=y # CONFIG_MISC_FILESYSTEMS is not set +CONFIG_CRYPTO_SHA256=y # CONFIG_CRYPTO_HW is not set # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_IA64 is not set From 8efd249babea2fec268cff90b9f5ca723dbb7499 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:33:59 +0530 Subject: [PATCH 272/474] powerpc/smp: Fix a crash while booting kvm guest with nr_cpus=2 Aneesh reported a crash with a fairly recent upstream kernel when booting kernel whose commandline was appended with nr_cpus=2 1:mon> e cpu 0x1: Vector: 300 (Data Access) at [c000000008a67bd0] pc: c00000000002557c: cpu_to_chip_id+0x3c/0x100 lr: c000000000058380: start_secondary+0x460/0xb00 sp: c000000008a67e70 msr: 8000000000001033 dar: 10 dsisr: 80000 current = 0xc00000000891bb00 paca = 0xc0000018ff981f80 irqmask: 0x03 irq_happened: 0x01 pid = 0, comm = swapper/1 Linux version 5.13.0-rc3-15704-ga050a6d2b7e8 (kvaneesh@ltc-boston8) (gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #433 SMP Tue May 25 02:38:49 CDT 2021 1:mon> t [link register ] c000000000058380 start_secondary+0x460/0xb00 [c000000008a67e70] c000000008a67eb0 (unreliable) [c000000008a67eb0] c0000000000589d4 start_secondary+0xab4/0xb00 [c000000008a67f90] c00000000000c654 start_secondary_prolog+0x10/0x14 Current code assumes that num_possible_cpus() is always greater than threads_per_core. However this may not be true when using nr_cpus=2 or similar options. Handle the case where num_possible_cpus() is not an exact multiple of threads_per_core. Fixes: c1e53367dab1 ("powerpc/smp: Cache CPU to chip lookup") Reported-by: Aneesh Kumar K.V Signed-off-by: Srikar Dronamraju Debugged-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-2-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f2abd88e0c25..a3eab89ee572 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1109,7 +1109,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } if (cpu_to_chip_id(boot_cpuid) != -1) { - int idx = num_possible_cpus() / threads_per_core; + int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); /* * All threads of a core will all belong to the same core, From b8b928030332a0ca16d42433eb2c3085600d8704 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:34:00 +0530 Subject: [PATCH 273/474] powerpc/smp: Update cpu_core_map on all PowerPc systems lscpu() uses core_siblings to list the number of sockets in the system. core_siblings is set using topology_core_cpumask. While optimizing the powerpc bootup path, Commit 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask"). it was found that updating cpu_core_mask() ended up taking a lot of time. It was thought that on Powerpc, cpu_core_mask() would always be same as cpu_cpu_mask() i.e number of sockets will always be equal to number of nodes. As an optimization, cpu_core_mask() was made a snapshot of cpu_cpu_mask(). However that was found to be false with PowerPc KVM guests, where each node could have more than one socket. So with Commit c47f892d7aa6 ("powerpc/smp: Reintroduce cpu_core_mask"), cpu_core_mask was updated based on chip_id but in an optimized way using some mask manipulations and chip_id caching. However on non-PowerNV and non-pseries KVM guests (i.e not implementing cpu_to_chip_id(), continued to use a copy of cpu_cpu_mask(). There are two issues that were noticed on such systems 1. lscpu would report one extra socket. On a IBM,9009-42A (aka zz system) which has only 2 chips/ sockets/ nodes, lscpu would report Architecture: ppc64le Byte Order: Little Endian CPU(s): 160 On-line CPU(s) list: 0-159 Thread(s) per core: 8 Core(s) per socket: 6 Socket(s): 3 <-------------- NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-79 NUMA node1 CPU(s): 80-159 2. Currently cpu_cpu_mask is updated when a core is added/removed. However its not updated when smt mode switching or on CPUs are explicitly offlined. However all other percpu masks are updated to ensure only active/online CPUs are in the masks. This results in build_sched_domain traces since there will be CPUs in cpu_cpu_mask() but those CPUs are not present in SMT / CACHE / MC / NUMA domains. A loop of threads running smt mode switching and core add/remove will soon show this trace. Hence cpu_cpu_mask has to be update at smt mode switch. This will have impact on cpu_core_mask(). cpu_core_mask() is a snapshot of cpu_cpu_mask. Different CPUs within the same socket will end up having different cpu_core_masks since they are snapshots at different points of time. This means when lscpu will start reporting many more sockets than the actual number of sockets/ nodes / chips. Different ways to handle this problem: A. Update the snapshot aka cpu_core_mask for all CPUs whenever cpu_cpu_mask is updated. This would a non-optimal solution. B. Instead of a cpumask_var_t, make cpu_core_map a cpumask pointer pointing to cpu_cpu_mask. However percpu cpumask pointer is frowned upon and we need a clean way to handle PowerPc KVM guest which is not a snapshot. C. Update cpu_core_masks all PowerPc systems like in PowerPc KVM guests using mask manipulations. This approach is relatively simple and unifies with the existing code. D. On top of 3, we could also resurrect get_physical_package_id which could return a nid for the said CPU. However this is not needed at this time. Option C is the preferred approach for now. While this is somewhat a revert of Commit 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask"). 1. Plain revert has some conflicts 2. For chip_id == -1, the cpu_core_mask is made identical to cpu_cpu_mask, unlike previously where cpu_core_mask was set to a core if chip_id doesn't exist. This goes by the principle that if chip_id is not exposed, then sockets / chip / node share the same set of CPUs. With the fix, lscpu o/p would be Architecture: ppc64le Byte Order: Little Endian CPU(s): 160 On-line CPU(s) list: 0-159 Thread(s) per core: 8 Core(s) per socket: 6 Socket(s): 2 <-------------- NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-79 NUMA node1 CPU(s): 80-159 Fixes: 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask") Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-3-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a3eab89ee572..4bdc339fff3c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1527,6 +1527,7 @@ static void add_cpu_to_masks(int cpu) * add it to it's own thread sibling mask. */ cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) if (cpu_online(i)) @@ -1544,11 +1545,6 @@ static void add_cpu_to_masks(int cpu) if (chip_id_lookup_table && ret) chip_id = cpu_to_chip_id(cpu); - if (chip_id == -1) { - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); - goto out; - } - if (shared_caches) submask_fn = cpu_l2_cache_mask; @@ -1558,6 +1554,10 @@ static void add_cpu_to_masks(int cpu) /* Skip all CPUs already part of current CPU core mask */ cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); + /* If chip_id is -1; limit the cpu_core_mask to within DIE*/ + if (chip_id == -1) + cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask); @@ -1567,7 +1567,6 @@ static void add_cpu_to_masks(int cpu) } } -out: free_cpumask_var(mask); } From 5bf63497b8ddf53d8973f68076119e482639b2bb Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:34:01 +0530 Subject: [PATCH 274/474] powerpc/smp: Enable CACHE domain for shared processor Currently CACHE domain is not enabled on shared processor mode PowerVM LPARS. On PowerVM systems, 'ibm,thread-group' device-tree property 2 under cpu-device-node indicates which all CPUs share L2-cache. However 'ibm,thread-group' device-tree property 2 is a relatively new property. In absence of 'ibm,thread-group' property 2, 'l2-cache' device property under cpu-device-node could help system to identify CPUs sharing L2-cache. However this property is not exposed by PhyP in shared processor mode configurations. In absence of properties that inform OS about which CPUs share L2-cache, fallback on core boundary. Here are some stats from Power9 shared LPAR with the changes. $ lscpu Architecture: ppc64le Byte Order: Little Endian CPU(s): 32 On-line CPU(s) list: 0-31 Thread(s) per core: 8 Core(s) per socket: 1 Socket(s): 3 NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K NUMA node0 CPU(s): 16-23 NUMA node1 CPU(s): 0-15,24-31 Physical sockets: 2 Physical chips: 1 Physical cores/chip: 10 Before patch $ grep -r . /sys/kernel/debug/sched/domains/cpu0/domain*/name Before /sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT /sys/kernel/debug/sched/domains/cpu0/domain1/name:DIE /sys/kernel/debug/sched/domains/cpu0/domain2/name:NUMA After /sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT /sys/kernel/debug/sched/domains/cpu0/domain1/name:CACHE /sys/kernel/debug/sched/domains/cpu0/domain2/name:DIE /sys/kernel/debug/sched/domains/cpu0/domain3/name:NUMA $ awk '/domain/{print $1, $2}' /proc/schedstat | sort -u | sed -e 's/00000000,//g' Before domain0 00000055 domain0 000000aa domain0 00005500 domain0 0000aa00 domain0 00550000 domain0 00aa0000 domain0 55000000 domain0 aa000000 domain1 00ff0000 domain1 ff00ffff domain2 ffffffff After domain0 00000055 domain0 000000aa domain0 00005500 domain0 0000aa00 domain0 00550000 domain0 00aa0000 domain0 55000000 domain0 aa000000 domain1 000000ff domain1 0000ff00 domain1 00ff0000 domain1 ff000000 domain2 ff00ffff domain2 ffffffff domain3 ffffffff (Lower is better) perf stat -a -r 5 -n perf bench sched pipe | tail -n 2 Before 153.798 +- 0.142 seconds time elapsed ( +- 0.09% ) After 111.545 +- 0.652 seconds time elapsed ( +- 0.58% ) which is an improvement of 27.47% Signed-off-by: Srikar Dronamraju Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-4-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 4bdc339fff3c..548aa58d25f9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1400,7 +1400,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) l2_cache = cpu_to_l2cache(cpu); if (!l2_cache || !*mask) { /* Assume only core siblings share cache with this CPU */ - for_each_cpu(i, submask_fn(cpu)) + for_each_cpu(i, cpu_sibling_mask(cpu)) set_cpus_related(cpu, i, cpu_l2_cache_mask); return false; From 544af6429777cefae2f8af9a9866df5e8cb21763 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:17 +0530 Subject: [PATCH 275/474] powerpc/numa: Drop dbg in favour of pr_debug powerpc supported numa=debug which is not documented. This option was used to print early debug output. However something more flexible can be achieved by using CONFIG_DYNAMIC_DEBUG. Hence drop dbg (and numa=debug) in favour of pr_debug Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase on to powerpc/next form2 affinity changes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-2-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe9c6ced40b2..73ba6b072274 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -40,9 +40,6 @@ static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; struct pglist_data *node_data[MAX_NUMNODES]; @@ -87,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %u nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -131,7 +128,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; @@ -149,7 +146,7 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); + pr_debug("adding cpu %d to node %d\n", cpu, node); if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) cpumask_set_cpu(cpu, node_to_cpumask_map[node]); @@ -160,7 +157,7 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); + pr_debug("removing cpu %lu from node %d\n", cpu, node); if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); @@ -450,10 +447,10 @@ static int __init find_primary_domain_index(void) if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { - dbg("Using form 2 affinity\n"); + pr_debug("Using form 2 affinity\n"); affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { - dbg("Using form 1 affinity\n"); + pr_debug("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; } else affinity_form = FORM0_AFFINITY; @@ -482,7 +479,7 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); goto err; } @@ -928,7 +925,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1251,9 +1248,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1416,7 +1410,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_SUCCESS: - dbg("VPHN hcall succeeded. Reset polling...\n"); + pr_debug("VPHN hcall succeeded. Reset polling...\n"); goto out; case H_FUNCTION: From 506c2075ffd8db352c53201ef166948a272e3bce Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:18 +0530 Subject: [PATCH 276/474] powerpc/numa: convert printk to pr_xxx Convert the remaining printk to pr_xxx One advantage would be all prints will now have prefix "numa:" from pr_fmt(). [ convert printk(KERN_ERR) to pr_warn : Suggested by Laurent Dufour ] Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase onto powerpc/next, s/WARNING/Warning/] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-3-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 73ba6b072274..f4e52581713e 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -162,8 +162,7 @@ static void unmap_cpu_from_node(unsigned long cpu) if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ @@ -479,15 +478,14 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } @@ -504,8 +502,8 @@ static int __init find_primary_domain_index(void) * MAX_DISTANCE_REF_POINTS domains. */ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } @@ -910,7 +908,7 @@ static int __init parse_numa_properties(void) const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } @@ -925,7 +923,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1038,10 +1036,8 @@ static void __init setup_nonnuma(void) unsigned int nid = 0; int i; - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); From 544a09ee7434b949552266d20ece538d35535bd5 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:19 +0530 Subject: [PATCH 277/474] powerpc/numa: Print debug statements only when required Currently, a debug message gets printed every time an attempt to add(remove) a CPU. However this is redundant if the CPU is already added (removed) from the node. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-4-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f4e52581713e..52bc63ee6a6a 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -146,10 +146,10 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - pr_debug("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) @@ -157,10 +157,9 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - pr_debug("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } From 9a245d0e1f006bc7ccf0285d0d520ed304d00c4a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:20 +0530 Subject: [PATCH 278/474] powerpc/numa: Update cpu_cpu_map on CPU online/offline cpu_cpu_map holds all the CPUs in the DIE. However in PowerPC, when onlining/offlining of CPUs, this mask doesn't get updated. This mask is however updated when CPUs are added/removed. So when both operations like online/offline of CPUs and adding/removing of CPUs are done simultaneously, then cpumaps end up broken. WARNING: CPU: 13 PID: 1142 at kernel/sched/topology.c:898 build_sched_domains+0xd48/0x1720 Modules linked in: rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables nfnetlink pseries_rng xts vmx_crypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 13 PID: 1142 Comm: kworker/13:2 Not tainted 5.13.0-rc6+ #28 Workqueue: events cpuset_hotplug_workfn NIP: c0000000001caac8 LR: c0000000001caac4 CTR: 00000000007088ec REGS: c00000005596f220 TRAP: 0700 Not tainted (5.13.0-rc6+) MSR: 8000000000029033 CR: 48828222 XER: 00000009 CFAR: c0000000001ea698 IRQMASK: 0 GPR00: c0000000001caac4 c00000005596f4c0 c000000001c4a400 0000000000000036 GPR04: 00000000fffdffff c00000005596f1d0 0000000000000027 c0000018cfd07f90 GPR08: 0000000000000023 0000000000000001 0000000000000027 c0000018fe68ffe8 GPR12: 0000000000008000 c00000001e9d1880 c00000013a047200 0000000000000800 GPR16: c000000001d3c7d0 0000000000000240 0000000000000048 c000000010aacd18 GPR20: 0000000000000001 c000000010aacc18 c00000013a047c00 c000000139ec2400 GPR24: 0000000000000280 c000000139ec2520 c000000136c1b400 c000000001c93060 GPR28: c00000013a047c20 c000000001d3c6c0 c000000001c978a0 000000000000000d NIP [c0000000001caac8] build_sched_domains+0xd48/0x1720 LR [c0000000001caac4] build_sched_domains+0xd44/0x1720 Call Trace: [c00000005596f4c0] [c0000000001caac4] build_sched_domains+0xd44/0x1720 (unreliable) [c00000005596f670] [c0000000001cc5ec] partition_sched_domains_locked+0x3ac/0x4b0 [c00000005596f710] [c0000000002804e4] rebuild_sched_domains_locked+0x404/0x9e0 [c00000005596f810] [c000000000283e60] rebuild_sched_domains+0x40/0x70 [c00000005596f840] [c000000000284124] cpuset_hotplug_workfn+0x294/0xf10 [c00000005596fc60] [c000000000175040] process_one_work+0x290/0x590 [c00000005596fd00] [c0000000001753c8] worker_thread+0x88/0x620 [c00000005596fda0] [c000000000181704] kthread+0x194/0x1a0 [c00000005596fe10] [c00000000000ccec] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 485af049 60000000 2fa30800 409e0028 80fe0000 e89a00f8 e86100e8 38da0120 7f88e378 7ce53b78 4801fb91 60000000 <0fe00000> 39000000 38e00000 38c00000 Fix this by updating cpu_cpu_map aka cpumask_of_node() on every CPU online/offline. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-5-srikar@linux.vnet.ibm.com --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ arch/powerpc/kernel/smp.c | 3 +++ arch/powerpc/mm/numa.c | 7 ++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a4712ecad3e9..36fcafb1fd6d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -66,6 +66,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); void update_numa_distance(struct device_node *node); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -95,6 +100,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) } static inline void update_numa_distance(struct device_node *node) {} + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 548aa58d25f9..9cc7d3dbf439 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1442,6 +1442,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1526,6 +1528,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. */ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 52bc63ee6a6a..6f14c8fb6359 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -142,7 +142,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -153,7 +153,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -800,9 +800,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } From 0c634bafe3bbee7a36dca7f1277057e05bf14d91 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:19 -0300 Subject: [PATCH 279/474] powerpc/pseries/iommu: Replace hard-coded page shift Some functions assume IOMMU page size can only be 4K (pageshift == 12). Update them to accept any page size passed, so we can use 64K pages. In the process, some defines like TCE_SHIFT were made obsolete, and then removed. IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show a RPN of 52-bit, and considers a 12-bit pageshift, so there should be no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn. It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and tce_buildmulti_pSeriesLP(). Most places had a tbl struct, so using tbl->it_page_shift was simple. tce_free_pSeriesLP() was a special case, since callers not always have a tbl struct, so adding a tceshift parameter seems the right thing to do. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com --- arch/powerpc/include/asm/tce.h | 8 ------ arch/powerpc/platforms/pseries/iommu.c | 39 +++++++++++++++----------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index db5fc2f2262d..0c34d2756d92 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -19,15 +19,7 @@ #define TCE_VB 0 #define TCE_PCI 1 -/* TCE page size is 4096 bytes (1 << 12) */ - -#define TCE_SHIFT 12 -#define TCE_PAGE_SIZE (1 << TCE_SHIFT) - #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */ - -#define TCE_RPN_MASK 0xfffffffffful /* 40-bit RPN (4K pages) */ -#define TCE_RPN_SHIFT 12 #define TCE_VALID 0x800 /* TCE valid */ #define TCE_ALLIO 0x400 /* TCE valid for all lpars */ #define TCE_PCI_WRITE 0x2 /* write from PCI allowed */ diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0c55b991f665..b1b8d12bab39 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, u64 proto_tce; __be64 *tcep; u64 rpn; + const unsigned long tceshift = tbl->it_page_shift; + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); proto_tce = TCE_PCI_READ; // Read allowed @@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = __pa(uaddr) >> TCE_SHIFT; - *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + rpn = __pa(uaddr) >> tceshift; + *tcep = cpu_to_be64(proto_tce | rpn << tceshift); - uaddr += TCE_PAGE_SIZE; + uaddr += pagesize; tcep++; } return 0; @@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(unsigned long liobn, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, @@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + tce = proto_tce | rpn << tceshift; rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(liobn, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, tceshift, (npages_start - (npages + 1))); break; } @@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long tcenum_start = tcenum, npages_start = npages; int ret = 0; unsigned long flags; + const unsigned long tceshift = tbl->it_page_shift; if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, npages, uaddr, + tceshift, npages, uaddr, direction, attrs); } @@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, + tceshift, npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -245,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, - (u64)tcenum << 12, + (u64)tcenum << tceshift, (u64)__pa(tcep), limit); @@ -277,12 +280,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); @@ -301,9 +305,11 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n u64 rc; if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) - return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); + return tce_free_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages); - rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); + rc = plpar_tce_stuff((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, 0, npages); if (rc && printk_ratelimit()) { printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); @@ -319,7 +325,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) u64 rc; unsigned long tce_ret; - rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); + rc = plpar_tce_get((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, &tce_ret); if (rc && printk_ratelimit()) { printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); From 3c33066a21903076722a2881556a92aa3cd7d359 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:20 -0300 Subject: [PATCH 280/474] powerpc/kernel/iommu: Add new iommu_table_in_use() helper Having a function to check if the iommu table has any allocation helps deciding if a tbl can be reset for using a new DMA window. It should be enough to replace all instances of !bitmap_empty(tbl...). iommu_table_in_use() skips reserved memory, so we don't need to worry about releasing it before testing. This causes iommu_table_release_pages() to become unnecessary, given it is only used to remove reserved memory for testing. Also, only allow storing reserved memory values in tbl if they are valid in the table, so there is no need to check it in the new helper. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-3-leobras.c@gmail.com --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/iommu.c | 61 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index deef7c94d7b6..bf3b84128525 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl); */ extern struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, unsigned long res_start, unsigned long res_end); +bool iommu_table_in_use(struct iommu_table *tbl); #define IOMMU_TABLE_GROUP_MAX_TABLES 2 diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 2af89a5e379f..ed98ad63633e 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -690,34 +690,26 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl, if (tbl->it_offset == 0) set_bit(0, tbl->it_map); + if (res_start < tbl->it_offset) + res_start = tbl->it_offset; + + if (res_end > (tbl->it_offset + tbl->it_size)) + res_end = tbl->it_offset + tbl->it_size; + + /* Check if res_start..res_end is a valid range in the table */ + if (res_start >= res_end) { + tbl->it_reserved_start = tbl->it_offset; + tbl->it_reserved_end = tbl->it_offset; + return; + } + tbl->it_reserved_start = res_start; tbl->it_reserved_end = res_end; - /* Check if res_start..res_end isn't empty and overlaps the table */ - if (res_start && res_end && - (tbl->it_offset + tbl->it_size < res_start || - res_end < tbl->it_offset)) - return; - for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) set_bit(i - tbl->it_offset, tbl->it_map); } -static void iommu_table_release_pages(struct iommu_table *tbl) -{ - int i; - - /* - * In case we have reserved the first bit, we should not emit - * the warning below. - */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); - - for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - clear_bit(i - tbl->it_offset, tbl->it_map); -} - /* * Build a iommu_table structure. This contains a bit map which * is used to manage allocation of the tce space. @@ -779,6 +771,22 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, return tbl; } +bool iommu_table_in_use(struct iommu_table *tbl) +{ + unsigned long start = 0, end; + + /* ignore reserved bit0 */ + if (tbl->it_offset == 0) + start = 1; + end = tbl->it_reserved_start - tbl->it_offset; + if (find_next_bit(tbl->it_map, end, start) != end) + return true; + + start = tbl->it_reserved_end - tbl->it_offset; + end = tbl->it_size; + return find_next_bit(tbl->it_map, end, start) != end; +} + static void iommu_table_free(struct kref *kref) { struct iommu_table *tbl; @@ -795,10 +803,8 @@ static void iommu_table_free(struct kref *kref) iommu_debugfs_del(tbl); - iommu_table_release_pages(tbl); - /* verify that table contains no entries */ - if (!bitmap_empty(tbl->it_map, tbl->it_size)) + if (iommu_table_in_use(tbl)) pr_warn("%s: Unexpected TCEs\n", __func__); /* free bitmap */ @@ -1099,14 +1105,9 @@ int iommu_take_ownership(struct iommu_table *tbl) for (i = 0; i < tbl->nr_pools; i++) spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); - iommu_table_release_pages(tbl); - - if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + if (iommu_table_in_use(tbl)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ - iommu_table_reserve_pages(tbl, tbl->it_reserved_start, - tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } From 4ff8677a0b192a58d998d1d34fc5168203041a24 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:21 -0300 Subject: [PATCH 281/474] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper Creates a helper to allow allocating a new iommu_table without the need to reallocate the iommu_group. This will be helpful for replacing the iommu_table for the new DMA window, after we remove the old one with iommu_tce_table_put(). Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-4-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b1b8d12bab39..33d82865d6e6 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -53,28 +53,31 @@ enum { DDW_EXT_QUERY_OUT_SIZE = 2 }; -static struct iommu_table_group *iommu_pseries_alloc_group(int node) +static struct iommu_table *iommu_pseries_alloc_table(int node) { - struct iommu_table_group *table_group; struct iommu_table *tbl; - table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, - node); - if (!table_group) - return NULL; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto free_group; + return NULL; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); + return tbl; +} - table_group->tables[0] = tbl; +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group; - return table_group; + table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); + if (!table_group) + return NULL; + + table_group->tables[0] = iommu_pseries_alloc_table(node); + if (table_group->tables[0]) + return table_group; -free_group: kfree(table_group); return NULL; } From 92a23219299cedde52e3298788484f4875d5ce0f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:22 -0300 Subject: [PATCH 282/474] powerpc/pseries/iommu: Add ddw_list_new_entry() helper There are two functions creating direct_window_list entries in a similar way, so create a ddw_list_new_entry() to avoid duplicity and simplify those functions. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-5-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 32 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 33d82865d6e6..712d1667144a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -874,6 +874,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int *window_shift) return dma_addr; } +static struct direct_window *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) +{ + struct direct_window *window; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + return NULL; + + window->device = pdn; + window->prop = dma64; + + return window; +} + static int find_existing_ddw_windows(void) { int len; @@ -886,18 +901,15 @@ static int find_existing_ddw_windows(void) for_each_node_with_property(pdn, DIRECT64_PROPNAME) { direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64) - continue; - - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window || len < sizeof(struct dynamic_dma_window_prop)) { - kfree(window); + if (!direct64 || len < sizeof(*direct64)) { remove_ddw(pdn, true); continue; } - window->device = pdn; - window->prop = direct64; + window = ddw_list_new_entry(pdn, direct64); + if (!window) + break; + spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); @@ -1307,7 +1319,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = kzalloc(sizeof(*window), GFP_KERNEL); + window = ddw_list_new_entry(pdn, ddwprop); if (!window) goto out_clear_window; @@ -1326,8 +1338,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_free_window; } - window->device = pdn; - window->prop = ddwprop; spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); From 2ca73c54ce24489518a56d816331b774044c2445 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:23 -0300 Subject: [PATCH 283/474] powerpc/pseries/iommu: Allow DDW windows starting at 0x00 enable_ddw() currently returns the address of the DMA window, which is considered invalid if has the value 0x00. Also, it only considers valid an address returned from find_existing_ddw if it's not 0x00. Changing this behavior makes sense, given the users of enable_ddw() only need to know if direct mapping is possible. It can also allow a DMA window starting at 0x00 to be used. This will be helpful for using a DDW with indirect mapping, as the window address will be different than 0x00, but it will not map the whole partition. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-6-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 712d1667144a..b34b473bbdc1 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -853,25 +853,26 @@ static void remove_ddw(struct device_node *np, bool remove_prop) np, ret); } -static u64 find_existing_ddw(struct device_node *pdn, int *window_shift) +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) { struct direct_window *window; const struct dynamic_dma_window_prop *direct64; - u64 dma_addr = 0; + bool found = false; spin_lock(&direct_window_list_lock); /* check if we already created a window and dupe that config if so */ list_for_each_entry(window, &direct_window_list, list) { if (window->device == pdn) { direct64 = window->prop; - dma_addr = be64_to_cpu(direct64->dma_base); + *dma_addr = be64_to_cpu(direct64->dma_base); *window_shift = be32_to_cpu(direct64->window_shift); + found = true; break; } } spin_unlock(&direct_window_list_lock); - return dma_addr; + return found; } static struct direct_window *ddw_list_new_entry(struct device_node *pdn, @@ -1161,20 +1162,20 @@ static int iommu_get_page_shift(u32 query_page_size) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by the direct mapped DMA code. + * returns true if can map all pages (direct mapping), false otherwise.. */ -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) { int len = 0, ret; int max_ram_len = order_base_2(ddw_memory_hotplug_max()); struct ddw_query_response query; struct ddw_create_response create; int page_shift; - u64 dma_addr; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; + bool ddw_enabled = false; struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; bool default_win_removed = false; @@ -1186,9 +1187,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) mutex_lock(&direct_window_init_mutex); - dma_addr = find_existing_ddw(pdn, &len); - if (dma_addr != 0) + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { + ddw_enabled = true; goto out_unlock; + } /* * If we already went through this for a previous function of @@ -1342,7 +1344,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); - dma_addr = be64_to_cpu(ddwprop->dma_base); + dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base); + ddw_enabled = true; goto out_unlock; out_free_window: @@ -1374,10 +1377,10 @@ out_unlock: * as RAM, then we failed to create a window to cover persistent * memory and need to set the DMA limit. */ - if (pmem_present && dma_addr && (len == max_ram_len)) - dev->dev.bus_dma_limit = dma_addr + (1ULL << len); + if (pmem_present && ddw_enabled && (len == max_ram_len)) + dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return dma_addr; + return ddw_enabled; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) @@ -1456,11 +1459,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) break; } - if (pdn && PCI_DN(pdn)) { - pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); - if (pdev->dev.archdata.dma_offset) - return true; - } + if (pdn && PCI_DN(pdn)) + return enable_ddw(pdev, pdn); return false; } From 7ed2ed2db2685a285cb09ab330dc4efea0b64022 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:24 -0300 Subject: [PATCH 284/474] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw() Code used to create a ddw property that was previously scattered in enable_ddw() is now gathered in ddw_property_create(), which deals with allocation and filling the property, letting it ready for of_property_add(), which now occurs in sequence. This created an opportunity to reorganize the second part of enable_ddw(): Without this patch enable_ddw() does, in order: kzalloc() property & members, create_ddw(), fill ddwprop inside property, ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory, of_add_property(), and list_add(). With this patch enable_ddw() does, in order: create_ddw(), ddw_property_create(), of_add_property(), ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory, and list_add(). This change requires of_remove_property() in case anything fails after of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk in all memory, which looks the most expensive operation, only if everything else succeeds. Also, the error path got remove_ddw() replaced by a new helper __remove_dma_window(), which only removes the new DDW with an rtas-call. For this, a new helper clean_dma_window() was needed to clean anything that could left if walk_system_ram_range() fails. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 129 ++++++++++++++++--------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b34b473bbdc1..00392582fe10 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -795,17 +795,10 @@ static int __init disable_ddw_setup(char *str) early_param("disable_ddw", disable_ddw_setup); -static void remove_dma_window(struct device_node *np, u32 *ddw_avail, - struct property *win) +static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) { - struct dynamic_dma_window_prop *dwp; - u64 liobn; int ret; - dwp = win->value; - liobn = (u64)be32_to_cpu(dwp->liobn); - - /* clear the whole window, note the arg is in kernel pages */ ret = tce_clearrange_multi_pSeriesLP(0, 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); if (ret) @@ -814,18 +807,39 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, else pr_debug("%pOF successfully cleared tces in window.\n", np); +} + +/* + * Call only if DMA window is clean. + */ +static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) +{ + int ret; ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); if (ret) - pr_warn("%pOF: failed to remove direct window: rtas returned " + pr_warn("%pOF: failed to remove DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); else - pr_debug("%pOF: successfully removed direct window: rtas returned " + pr_debug("%pOF: successfully removed DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); } +static void remove_dma_window(struct device_node *np, u32 *ddw_avail, + struct property *win) +{ + struct dynamic_dma_window_prop *dwp; + u64 liobn; + + dwp = win->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + clean_dma_window(np, dwp); + __remove_dma_window(np, ddw_avail, liobn); +} + static void remove_ddw(struct device_node *np, bool remove_prop) { struct property *win; @@ -1153,6 +1167,35 @@ static int iommu_get_page_shift(u32 query_page_size) return 0; } +static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr, + u32 page_shift, u32 window_shift) +{ + struct dynamic_dma_window_prop *ddwprop; + struct property *win64; + + win64 = kzalloc(sizeof(*win64), GFP_KERNEL); + if (!win64) + return NULL; + + win64->name = kstrdup(propname, GFP_KERNEL); + ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->value = ddwprop; + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + kfree(win64->name); + kfree(win64->value); + kfree(win64); + return NULL; + } + + ddwprop->liobn = cpu_to_be32(liobn); + ddwprop->dma_base = cpu_to_be64(dma_addr); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(window_shift); + + return win64; +} + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1171,12 +1214,12 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct ddw_query_response query; struct ddw_create_response create; int page_shift; + u64 win_addr; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; bool ddw_enabled = false; - struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; bool default_win_removed = false; bool pmem_present; @@ -1293,72 +1336,68 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) 1ULL << page_shift); goto out_failed; } - win64 = kzalloc(sizeof(struct property), GFP_KERNEL); - if (!win64) { - dev_info(&dev->dev, - "couldn't allocate property for 64bit dma window\n"); - goto out_failed; - } - win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); - win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); - win64->length = sizeof(*ddwprop); - if (!win64->name || !win64->value) { - dev_info(&dev->dev, - "couldn't allocate property name and value\n"); - goto out_free_prop; - } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); if (ret != 0) - goto out_free_prop; - - ddwprop->liobn = cpu_to_be32(create.liobn); - ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | - create.addr_lo); - ddwprop->tce_shift = cpu_to_be32(page_shift); - ddwprop->window_shift = cpu_to_be32(len); + goto out_failed; dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = ddw_list_new_entry(pdn, ddwprop); + win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; + win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr, + page_shift, len); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property, property name, or value\n"); + goto out_remove_win; + } + + ret = of_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + pdn, ret); + goto out_free_prop; + } + + window = ddw_list_new_entry(pdn, win64->value); if (!window) - goto out_clear_window; + goto out_del_prop; ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", dn, ret); - goto out_free_window; - } - ret = of_add_property(pdn, win64); - if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", - pdn, ret); - goto out_free_window; + /* Make sure to clean DDW if any TCE was set*/ + clean_dma_window(pdn, win64->value); + goto out_del_list; } spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); - dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base); + dev->dev.archdata.dma_offset = win_addr; ddw_enabled = true; goto out_unlock; -out_free_window: +out_del_list: kfree(window); -out_clear_window: - remove_ddw(pdn, true); +out_del_prop: + of_remove_property(pdn, win64); out_free_prop: kfree(win64->name); kfree(win64->value); kfree(win64); +out_remove_win: + /* DDW is clean, so it's ok to call this directly. */ + __remove_dma_window(pdn, ddw_avail, create.liobn); + out_failed: if (default_win_removed) reset_dma_window(dev, pdn); From fc8cba8f989fb98e496b33a78476861e246c42a0 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:25 -0300 Subject: [PATCH 285/474] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper Add a new helper _iommu_table_setparms(), and use it in iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated code. Also, setting tbl->it_ops was happening outsite iommu_table_setparms*(), so move it to the new helper. Since we need the iommu_table_ops to be declared before used, declare iommu_table_lpar_multi_ops and iommu_table_pseries_ops to before their respective iommu_table_setparms*(). Signed-off-by: Leonardo Bras Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-8-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 72 ++++++++++++++------------ 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 00392582fe10..a47f59a8f107 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -501,6 +501,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } +static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, + unsigned long liobn, unsigned long win_addr, + unsigned long window_size, unsigned long page_shift, + void *base, struct iommu_table_ops *table_ops) +{ + tbl->it_busno = busno; + tbl->it_index = liobn; + tbl->it_offset = win_addr >> page_shift; + tbl->it_size = window_size >> page_shift; + tbl->it_page_shift = page_shift; + tbl->it_base = (unsigned long)base; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_ops = table_ops; +} + +struct iommu_table_ops iommu_table_pseries_ops; + static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl) @@ -509,8 +527,13 @@ static void iommu_table_setparms(struct pci_controller *phb, const unsigned long *basep; const u32 *sizep; - node = phb->dn; + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + node = phb->dn; basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { @@ -519,33 +542,18 @@ static void iommu_table_setparms(struct pci_controller *phb, return; } - tbl->it_base = (unsigned long)__va(*basep); + iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, + phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, + __va(*basep), &iommu_table_pseries_ops); if (!is_kdump_kernel()) memset((void *)tbl->it_base, 0, *sizep); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - - /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; - - /* Test if we are going over 2GB of DMA space */ - if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { - udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - } - phb->dma_window_base_cur += phb->dma_window_size; - - /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; - - tbl->it_index = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; } +struct iommu_table_ops iommu_table_lpar_multi_ops; + /* * iommu_table_setparms_lpar * @@ -557,17 +565,13 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, struct iommu_table_group *table_group, const __be32 *dma_window) { - unsigned long offset, size; + unsigned long offset, size, liobn; - of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); + of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); + + iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, + &iommu_table_lpar_multi_ops); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - tbl->it_base = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> tbl->it_page_shift; - tbl->it_size = size >> tbl->it_page_shift; table_group->tce32_start = offset; table_group->tce32_size = size; @@ -647,7 +651,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -730,7 +734,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); iommu_register_group(ppci->table_group, @@ -760,7 +764,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -1461,7 +1465,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); From a5fd95120c653962a9e75e260a35436b96d2c991 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:26 -0300 Subject: [PATCH 286/474] powerpc/pseries/iommu: Update remove_dma_window() to accept property name Update remove_dma_window() so it can be used to remove DDW with a given property name. This enables the creation of new property names for DDW, so we can have different usage for it, like indirect mapping. Also, add return values to it so we can check if the property was found while removing the active DDW. This allows skipping the remaining property names while reducing the impact of multiple property names. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-9-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index a47f59a8f107..901f290999d0 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -844,31 +844,33 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, __remove_dma_window(np, ddw_avail, liobn); } -static void remove_ddw(struct device_node *np, bool remove_prop) +static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name) { struct property *win; u32 ddw_avail[DDW_APPLICABLE_SIZE]; int ret = 0; + win = of_find_property(np, win_name, NULL); + if (!win) + return -EINVAL; + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", &ddw_avail[0], DDW_APPLICABLE_SIZE); if (ret) - return; + return 0; - win = of_find_property(np, DIRECT64_PROPNAME, NULL); - if (!win) - return; if (win->length >= sizeof(struct dynamic_dma_window_prop)) remove_dma_window(np, ddw_avail, win); if (!remove_prop) - return; + return 0; ret = of_remove_property(np, win); if (ret) pr_warn("%pOF: failed to remove direct window property: %d\n", np, ret); + return 0; } static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) @@ -921,7 +923,7 @@ static int find_existing_ddw_windows(void) for_each_node_with_property(pdn, DIRECT64_PROPNAME) { direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); if (!direct64 || len < sizeof(*direct64)) { - remove_ddw(pdn, true); + remove_ddw(pdn, true, DIRECT64_PROPNAME); continue; } @@ -1565,7 +1567,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. */ - remove_ddw(np, false); + remove_ddw(np, false, DIRECT64_PROPNAME); if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); From 8599395d34f2dd7b77bef42da1d99798e7a3d58f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:27 -0300 Subject: [PATCH 287/474] powerpc/pseries/iommu: Find existing DDW with given property name At the moment pseries stores information about created directly mapped DDW window in DIRECT64_PROPNAME. With the objective of implementing indirect DMA mapping with DDW, it's necessary to have another propriety name to make sure kexec'ing into older kernels does not break, as it would if we reuse DIRECT64_PROPNAME. In order to have this, find_existing_ddw_windows() needs to be able to look for different property names. Extract find_existing_ddw_windows() into find_existing_ddw_windows_named() and calls it with current property name. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-10-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 901f290999d0..e11c00b2dc1e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -910,24 +910,21 @@ static struct direct_window *ddw_list_new_entry(struct device_node *pdn, return window; } -static int find_existing_ddw_windows(void) +static void find_existing_ddw_windows_named(const char *name) { int len; struct device_node *pdn; struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; + const struct dynamic_dma_window_prop *dma64; - if (!firmware_has_feature(FW_FEATURE_LPAR)) - return 0; - - for_each_node_with_property(pdn, DIRECT64_PROPNAME) { - direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64 || len < sizeof(*direct64)) { - remove_ddw(pdn, true, DIRECT64_PROPNAME); + for_each_node_with_property(pdn, name) { + dma64 = of_get_property(pdn, name, &len); + if (!dma64 || len < sizeof(*dma64)) { + remove_ddw(pdn, true, name); continue; } - window = ddw_list_new_entry(pdn, direct64); + window = ddw_list_new_entry(pdn, dma64); if (!window) break; @@ -935,6 +932,14 @@ static int find_existing_ddw_windows(void) list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); } +} + +static int find_existing_ddw_windows(void) +{ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + find_existing_ddw_windows_named(DIRECT64_PROPNAME); return 0; } From 381ceda88c4c4c8345cad1cffa6328892f15dca6 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:28 -0300 Subject: [PATCH 288/474] powerpc/pseries/iommu: Make use of DDW for indirect mapping So far it's assumed possible to map the guest RAM 1:1 to the bus, which works with a small number of devices. SRIOV changes it as the user can configure hundreds VFs and since phyp preallocates TCEs and does not allow IOMMU pages bigger than 64K, it has to limit the number of TCEs per a PE to limit waste of physical pages. As of today, if the assumed direct mapping is not possible, DDW creation is skipped and the default DMA window "ibm,dma-window" is used instead. By using DDW, indirect mapping can get more TCEs than available for the default DMA window, and also get access to using much larger pagesizes (16MB as implemented in qemu vs 4k from default DMA window), causing a significant increase on the maximum amount of memory that can be IOMMU mapped at the same time. Indirect mapping will only be used if direct mapping is not a possibility. For indirect mapping, it's necessary to re-create the iommu_table with the new DMA window parameters, so iommu_alloc() can use it. Removing the default DMA window for using DDW with indirect mapping is only allowed if there is no current IOMMU memory allocated in the iommu_table. enable_ddw() is aborted otherwise. Even though there won't be both direct and indirect mappings at the same time, we can't reuse the DIRECT64_PROPNAME property name, or else an older kexec()ed kernel can assume direct mapping, and skip iommu_alloc(), causing undesirable behavior. So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info" was created to represent a DDW that does not allow direct mapping. Signed-off-by: Leonardo Bras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 89 +++++++++++++++++++++----- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index e11c00b2dc1e..0eccc29f5573 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock); /* protects initializing window twice for same device */ static DEFINE_MUTEX(direct_window_init_mutex); #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" +#define DMA64_PROPNAME "linux,dma64-ddr-window-info" static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, unsigned long num_pfn, const void *arg) @@ -940,6 +941,7 @@ static int find_existing_ddw_windows(void) return 0; find_existing_ddw_windows_named(DIRECT64_PROPNAME); + find_existing_ddw_windows_named(DMA64_PROPNAME); return 0; } @@ -1226,14 +1228,17 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct ddw_create_response create; int page_shift; u64 win_addr; + const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; - bool default_win_removed = false; + bool default_win_removed = false, direct_mapping = false; bool pmem_present; + struct pci_dn *pci = PCI_DN(pdn); + struct iommu_table *tbl = pci->table_group->tables[0]; dn = of_find_node_by_type(NULL, "ibm,pmemory"); pmem_present = dn != NULL; @@ -1242,6 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) mutex_lock(&direct_window_init_mutex); if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { + direct_mapping = (len >= max_ram_len); ddw_enabled = true; goto out_unlock; } @@ -1322,8 +1328,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) query.page_size); goto out_failed; } - /* verify the window * number of ptes will map the partition */ - /* check largest block * page size > max memory hotplug addr */ + + /* * The "ibm,pmemory" can appear anywhere in the address space. * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS @@ -1339,13 +1345,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_info(&dev->dev, "Skipping ibm,pmemory"); } + /* check if the available block * number of ptes will map everything */ if (query.largest_available_block < (1ULL << (len - page_shift))) { dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n", 1ULL << len, query.largest_available_block, 1ULL << page_shift); - goto out_failed; + + /* DDW + IOMMU on single window may fail if there is any allocation */ + if (default_win_removed && iommu_table_in_use(tbl)) { + dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); + goto out_failed; + } + + len = order_base_2(query.largest_available_block << page_shift); + win_name = DMA64_PROPNAME; + } else { + direct_mapping = true; + win_name = DIRECT64_PROPNAME; } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); @@ -1356,8 +1374,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) create.liobn, dn); win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; - win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr, - page_shift, len); + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len); + if (!win64) { dev_info(&dev->dev, "couldn't allocate property, property name, or value\n"); @@ -1375,15 +1393,54 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (!window) goto out_del_prop; - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, - win64->value, tce_setrange_multi_pSeriesLP_walk); - if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", - dn, ret); + if (direct_mapping) { + /* DDW maps the whole partition, so enable direct DMA mapping */ + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dn, ret); /* Make sure to clean DDW if any TCE was set*/ clean_dma_window(pdn, win64->value); - goto out_del_list; + goto out_del_list; + } + } else { + struct iommu_table *newtbl; + int i; + + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; + + /* Look for MMIO32 */ + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) + break; + } + + if (i == ARRAY_SIZE(pci->phb->mem_resources)) + goto out_del_list; + + /* New table for using DDW instead of the default DMA window */ + newtbl = iommu_pseries_alloc_table(pci->phb->node); + if (!newtbl) { + dev_dbg(&dev->dev, "couldn't create new IOMMU table\n"); + goto out_del_list; + } + + iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr, + 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); + iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start, + pci->phb->mem_resources[i].end); + + pci->table_group->tables[1] = newtbl; + + /* Keep default DMA window stuct if removed */ + if (default_win_removed) { + tbl->it_size = 0; + kfree(tbl->it_map); + } + + set_iommu_table_base(&dev->dev, newtbl); } spin_lock(&direct_window_list_lock); @@ -1427,10 +1484,10 @@ out_unlock: * as RAM, then we failed to create a window to cover persistent * memory and need to set the DMA limit. */ - if (pmem_present && ddw_enabled && (len == max_ram_len)) + if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len) dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return ddw_enabled; + return ddw_enabled && direct_mapping; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) @@ -1572,7 +1629,9 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. */ - remove_ddw(np, false, DIRECT64_PROPNAME); + if (remove_ddw(np, false, DIRECT64_PROPNAME)) + remove_ddw(np, false, DMA64_PROPNAME); + if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); From 57dbbe590f152e5e8a3ff8bf5ba163df34eeae0b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:29 -0300 Subject: [PATCH 289/474] powerpc/pseries/iommu: Rename "direct window" to "dma window" A previous change introduced the usage of DDW as a bigger indirect DMA mapping when the DDW available size does not map the whole partition. As most of the code that manipulates direct mappings was reused for indirect mappings, it's necessary to rename all names and debug/info messages to reflect that it can be used for both kinds of mapping. This should cause no behavioural change, just adjust naming. Signed-off-by: Leonardo Bras Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-12-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 87 +++++++++++++------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0eccc29f5573..dab5c56ffd0e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -349,7 +349,7 @@ struct dynamic_dma_window_prop { __be32 window_shift; /* ilog2(tce_window_size) */ }; -struct direct_window { +struct dma_win { struct device_node *device; const struct dynamic_dma_window_prop *prop; struct list_head list; @@ -369,11 +369,11 @@ struct ddw_create_response { u32 addr_lo; }; -static LIST_HEAD(direct_window_list); +static LIST_HEAD(dma_win_list); /* prevents races between memory on/offline and window creation */ -static DEFINE_SPINLOCK(direct_window_list_lock); +static DEFINE_SPINLOCK(dma_win_list_lock); /* protects initializing window twice for same device */ -static DEFINE_MUTEX(direct_window_init_mutex); +static DEFINE_MUTEX(dma_win_init_mutex); #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" #define DMA64_PROPNAME "linux,dma64-ddr-window-info" @@ -713,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - /* Find nearest ibm,dma-window, walking up the device tree */ + /* + * Find nearest ibm,dma-window (default DMA window), walking up the + * device tree + */ for (pdn = dn; pdn != NULL; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window != NULL) @@ -869,37 +872,37 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_ ret = of_remove_property(np, win); if (ret) - pr_warn("%pOF: failed to remove direct window property: %d\n", + pr_warn("%pOF: failed to remove DMA window property: %d\n", np, ret); return 0; } static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) { - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; bool found = false; - spin_lock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); /* check if we already created a window and dupe that config if so */ - list_for_each_entry(window, &direct_window_list, list) { + list_for_each_entry(window, &dma_win_list, list) { if (window->device == pdn) { - direct64 = window->prop; - *dma_addr = be64_to_cpu(direct64->dma_base); - *window_shift = be32_to_cpu(direct64->window_shift); + dma64 = window->prop; + *dma_addr = be64_to_cpu(dma64->dma_base); + *window_shift = be32_to_cpu(dma64->window_shift); found = true; break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); return found; } -static struct direct_window *ddw_list_new_entry(struct device_node *pdn, - const struct dynamic_dma_window_prop *dma64) +static struct dma_win *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) { - struct direct_window *window; + struct dma_win *window; window = kzalloc(sizeof(*window), GFP_KERNEL); if (!window) @@ -915,7 +918,7 @@ static void find_existing_ddw_windows_named(const char *name) { int len; struct device_node *pdn; - struct direct_window *window; + struct dma_win *window; const struct dynamic_dma_window_prop *dma64; for_each_node_with_property(pdn, name) { @@ -929,9 +932,9 @@ static void find_existing_ddw_windows_named(const char *name) if (!window) break; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); } } @@ -1231,7 +1234,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; - struct direct_window *window; + struct dma_win *window; struct property *win64; bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; @@ -1244,7 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) pmem_present = dn != NULL; of_node_put(dn); - mutex_lock(&direct_window_init_mutex); + mutex_lock(&dma_win_init_mutex); if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { direct_mapping = (len >= max_ram_len); @@ -1324,8 +1327,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) page_shift = iommu_get_page_shift(query.page_size); if (!page_shift) { - dev_dbg(&dev->dev, "no supported direct page size in mask %x", - query.page_size); + dev_dbg(&dev->dev, "no supported page size in mask %x", + query.page_size); goto out_failed; } @@ -1384,7 +1387,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) ret = of_add_property(pdn, win64); if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d", pdn, ret); goto out_free_prop; } @@ -1398,7 +1401,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", dn, ret); /* Make sure to clean DDW if any TCE was set*/ @@ -1443,9 +1446,9 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) set_iommu_table_base(&dev->dev, newtbl); } - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); dev->dev.archdata.dma_offset = win_addr; ddw_enabled = true; @@ -1477,7 +1480,7 @@ out_failed: list_add(&fpdn->list, &failed_ddw_pdn_list); out_unlock: - mutex_unlock(&direct_window_init_mutex); + mutex_unlock(&dma_win_init_mutex); /* * If we have persistent memory and the window size is only as big @@ -1575,29 +1578,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct direct_window *window; + struct dma_win *window; struct memory_notify *arg = data; int ret = 0; switch (action) { case MEM_GOING_ONLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; case MEM_CANCEL_ONLINE: case MEM_OFFLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: break; @@ -1618,7 +1621,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti struct of_reconfig_data *rd = data; struct device_node *np = rd->dn; struct pci_dn *pci = PCI_DN(np); - struct direct_window *window; + struct dma_win *window; switch (action) { case OF_RECONFIG_DETACH_NODE: @@ -1636,15 +1639,15 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti iommu_pseries_free_group(pci->table_group, np->full_name); - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { if (window->device == np) { list_del(&window->list); kfree(window); break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: err = NOTIFY_DONE; From 71f8817c28e2e1e5549138e2aef68c2fd784e162 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Mon, 23 Aug 2021 18:34:05 +0100 Subject: [PATCH 290/474] MIPS: ingenic: Unconditionally enable clock of CPU #0 Make sure that the PLL that feeds the CPU won't be stopped while the kernel is running. This fixes a problem on JZ4760 (and probably others) where under very specific conditions, the main PLL would be turned OFF when the kernel was shutting down, causing the shutdown process to fail. Signed-off-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/board-ingenic.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/mips/generic/board-ingenic.c b/arch/mips/generic/board-ingenic.c index e86e0016c548..3f44f14bdb33 100644 --- a/arch/mips/generic/board-ingenic.c +++ b/arch/mips/generic/board-ingenic.c @@ -7,6 +7,8 @@ * Copyright (C) 2020 Paul Cercueil */ +#include +#include #include #include #include @@ -129,10 +131,36 @@ static const struct platform_suspend_ops ingenic_pm_ops __maybe_unused = { static int __init ingenic_pm_init(void) { + struct device_node *cpu_node; + struct clk *cpu0_clk; + int ret; + if (boot_cpu_type() == CPU_XBURST) { if (IS_ENABLED(CONFIG_PM_SLEEP)) suspend_set_ops(&ingenic_pm_ops); _machine_halt = ingenic_halt; + + /* + * Unconditionally enable the clock for the first CPU. + * This makes sure that the PLL that feeds the CPU won't be + * stopped while the kernel is running. + */ + cpu_node = of_get_cpu_node(0, NULL); + if (!cpu_node) { + pr_err("Unable to get CPU node\n"); + } else { + cpu0_clk = of_clk_get(cpu_node, 0); + if (IS_ERR(cpu0_clk)) { + pr_err("Unable to get CPU0 clock\n"); + return PTR_ERR(cpu0_clk); + } + + ret = clk_prepare_enable(cpu0_clk); + if (ret) { + pr_err("Unable to enable CPU0 clock\n"); + return ret; + } + } } return 0; From 1d78dfde33a02da1d816279c2e3452978b7abd39 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 27 Aug 2021 14:07:06 +1000 Subject: [PATCH 291/474] KVM: PPC: Fix clearing never mapped TCEs in realmode Since commit e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too"), pages for TCE tables for KVM guests are allocated only when needed. This allows skipping any update when clearing TCEs. This works mostly fine as TCE updates are handled when the MMU is enabled. The realmode handlers fail with H_TOO_HARD when pages are not yet allocated, except when clearing a TCE in which case KVM prints a warning and proceeds to dereference a NULL pointer, which crashes the host OS. This has not been caught so far as the change in commit e1a1ef84cd07 is reasonably new, and POWER9 runs mostly radix which does not use realmode handlers. With hash, the default TCE table is memset() by QEMU when the machine is reset which triggers page faults and the KVM TCE device's kvm_spapr_tce_fault() handles those with MMU on. And the huge DMA windows are not cleared by VMs which instead successfully create a DMA window big enough to map the VM memory 1:1 and then VMs just map everything without clearing. This started crashing now as commit 381ceda88c4c ("powerpc/pseries/iommu: Make use of DDW for indirect mapping") added a mode when a dymanic DMA window not big enough to map the VM memory 1:1 but it is used anyway, and the VM now is the first (i.e. not QEMU) to clear a just created table. Note that upstream QEMU needs to be modified to trigger the VM to trigger the host OS crash. This replaces WARN_ON_ONCE_RM() with a check and return, and adds another warning if TCE is not being cleared. Fixes: e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210827040706.517652-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio_hv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index dc6591548f0c..636c6ae0939b 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -173,10 +173,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; /* - * page must not be NULL in real mode, - * kvmppc_rm_ioba_validate() must have taken care of this. + * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is + * being cleared, otherwise it returns H_TOO_HARD and we skip this. */ - WARN_ON_ONCE_RM(!page); + if (!page) { + WARN_ON_ONCE_RM(tce != 0); + return; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; From 7851155a1a7c33f2270dcd979ab6532a89be5293 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 22:59:44 +0930 Subject: [PATCH 292/474] openrisc/litex: Update uart address Recent litex socs will place the UART at 0xe0006800. Signed-off-by: Joel Stanley Signed-off-by: Stafford Horne --- arch/openrisc/boot/dts/or1klitex.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/openrisc/boot/dts/or1klitex.dts b/arch/openrisc/boot/dts/or1klitex.dts index 3f9867aa3844..baba4f49fa6b 100644 --- a/arch/openrisc/boot/dts/or1klitex.dts +++ b/arch/openrisc/boot/dts/or1klitex.dts @@ -41,10 +41,10 @@ interrupt-controller; }; - serial0: serial@e0002000 { + serial0: serial@e0006800 { device_type = "serial"; compatible = "litex,liteuart"; - reg = <0xe0002000 0x100>; + reg = <0xe0006800 0x100>; }; soc_ctrl0: soc_controller@e0000000 { From 978c791491bce8cc1a27cb50392a2e8bbcea79d4 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 22:59:45 +0930 Subject: [PATCH 293/474] openrisc/litex: Add ethernet device Add the liteeth ethernet device. Signed-off-by: Joel Stanley Signed-off-by: Stafford Horne --- arch/openrisc/boot/dts/or1klitex.dts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/openrisc/boot/dts/or1klitex.dts b/arch/openrisc/boot/dts/or1klitex.dts index baba4f49fa6b..91c7173c50e6 100644 --- a/arch/openrisc/boot/dts/or1klitex.dts +++ b/arch/openrisc/boot/dts/or1klitex.dts @@ -52,4 +52,13 @@ reg = <0xe0000000 0xc>; status = "okay"; }; + + ethernet@e0001000 { + compatible = "litex,liteeth"; + reg = <0xe0001000 0x7c>, + <0xe0001800 0x0a>, + <0x80000000 0x2000>; + reg-names = "mac", "mdio", "buffer"; + interrupts = <2>; + }; }; From 1955d843efc3b5cf3ab4878986a87ad4972ff4e1 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 22:59:46 +0930 Subject: [PATCH 294/474] openrisc/litex: Update defconfig Add the liteeth network device and basic network options, and update the options by doing a savedefconfig. Signed-off-by: Joel Stanley Signed-off-by: Stafford Horne --- arch/openrisc/configs/or1klitex_defconfig | 26 ++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index 3c2c70d3d740..d695879a4d26 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -1,18 +1,24 @@ CONFIG_BLK_DEV_INITRD=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y -CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_EMBEDDED=y +CONFIG_OPENRISC_BUILTIN_DTB="or1klitex" +CONFIG_HZ_100=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y -CONFIG_EMBEDDED=y -CONFIG_HZ_100=y -CONFIG_INITRAMFS_SOURCE="openrisc-rootfs.cpio.gz" CONFIG_OF_OVERLAY=y -CONFIG_OPENRISC_BUILTIN_DTB="or1klitex" -CONFIG_PANIC_ON_OOPS=y -CONFIG_PRINTK_TIME=y -CONFIG_LITEX_SOC_CONTROLLER=y +CONFIG_NETDEVICES=y +CONFIG_LITEX_LITEETH=y CONFIG_SERIAL_LITEUART=y CONFIG_SERIAL_LITEUART_CONSOLE=y -CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_TTY_PRINTK=y +CONFIG_LITEX_SOC_CONTROLLER=y +CONFIG_TMPFS=y +CONFIG_PRINTK_TIME=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BUG_ON_DATA_CORRUPTION=y From f3c4b1341e8320e63f197a554fc5a25686a11d22 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Fri, 27 Aug 2021 11:48:02 +0800 Subject: [PATCH 295/474] swiotlb: use depends on for DMA_RESTRICTED_POOL Use depends on instead of select for DMA_RESTRICTED_POOL; otherwise it will make SWIOTLB user configurable and cause compile errors for some arch (e.g. mips). Fixes: 0b84e4f8b793 ("swiotlb: Add restricted DMA pool initialization") Acked-by: Will Deacon Reported-by: Guenter Roeck Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 3e961dc39634..d19f9fef4ab9 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -82,8 +82,7 @@ config SWIOTLB config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" - depends on OF && OF_RESERVED_MEM - select SWIOTLB + depends on OF && OF_RESERVED_MEM && SWIOTLB help This enables support for restricted DMA pools which provide a level of DMA memory protection on systems with limited hardware protection From b14b8b1ed0e15b8f43fba9c25654278a31ee3c2f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 31 Aug 2021 23:51:51 +1000 Subject: [PATCH 296/474] powerpc/ptdump: Fix generic ptdump for 64-bit Since the conversion to generic ptdump we see crashes on 64-bit: BUG: Unable to handle kernel data access on read at 0xc0eeff7f00000000 Faulting instruction address: 0xc00000000045e5fc Oops: Kernel access of bad area, sig: 11 [#1] ... NIP __walk_page_range+0x2bc/0xce0 LR __walk_page_range+0x240/0xce0 Call Trace: __walk_page_range+0x240/0xce0 (unreliable) walk_page_range_novma+0x74/0xb0 ptdump_walk_pgd+0x98/0x170 ptdump_check_wx+0x88/0xd0 mark_rodata_ro+0x48/0x80 kernel_init+0x74/0x1a0 ret_from_kernel_thread+0x5c/0x64 What's happening is that have walked off the end of the kernel page tables, and started dereferencing junk values. That happens because we initialised the ptdump_range to span all the way up to 0xffffffffffffffff: static struct ptdump_range ptdump_range[] __ro_after_init = { {TASK_SIZE_MAX, ~0UL}, But the kernel page tables don't span that far. So on 64-bit set the end of the range to be the address immediately past the end of the kernel page tables, to limit the page table walk to valid addresses. Fixes: e084728393a5 ("powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210831135151.886620-1-mpe@ellerman.id.au --- arch/powerpc/mm/ptdump/ptdump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2d80d775d15e..bf251191e78d 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -359,6 +359,8 @@ static int __init ptdump_init(void) ptdump_range[0].start = KERN_VIRT_START; else ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); #endif populate_markers(); From e432fe97f3e5de325b40021e505cce53877586c5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 27 Aug 2021 22:51:06 +1000 Subject: [PATCH 297/474] powerpc/bug: Cast to unsigned long before passing to inline asm In commit 1e688dd2a3d6 ("powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto") we changed WARN_ON(). Previously it would take the warning condition, x, and double negate it before converting the result to int, and passing that int to the underlying inline asm. ie: #define WARN_ON(x) ({ int __ret_warn_on = !!(x); if (__builtin_constant_p(__ret_warn_on)) { ... } else { BUG_ENTRY(PPC_TLNEI " %4, 0", BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), "r" (__ret_warn_on)); The asm then does a full register width comparison with zero and traps if it is non-zero (PPC_TLNEI). The new code instead passes the full expression, x, with some arbitrary type, to the inline asm: #define WARN_ON(x) ({ ... do { if (__builtin_constant_p((x))) { ... } else { ... WARN_ENTRY(PPC_TLNEI " %4, 0", BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), __label_warn_on, "r" (x)); As reported[1] by Nathan, when building with clang this can cause spurious warnings to fire repeatedly at boot: WARNING: CPU: 0 PID: 1 at lib/klist.c:62 .klist_add_tail+0x3c/0x110 Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 5.14.0-rc7-next-20210825 #1 NIP: c0000000007ff81c LR: c00000000090a038 CTR: 0000000000000000 REGS: c0000000073c32a0 TRAP: 0700 Tainted: G W (5.14.0-rc7-next-20210825) MSR: 8000000002029032 CR: 22000a40 XER: 00000000 CFAR: c00000000090a034 IRQMASK: 0 GPR00: c00000000090a038 c0000000073c3540 c000000001be3200 0000000000000001 GPR04: c0000000072d65c0 0000000000000000 c0000000091ba798 c0000000091bb0a0 GPR08: 0000000000000001 0000000000000000 c000000008581918 fffffffffffffc00 GPR12: 0000000044000240 c000000001dd0000 c000000000012300 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c0000000017e3200 0000000000000000 c000000001a0e778 GPR28: c0000000072d65b0 c0000000072d65a8 c000000007de72c8 c0000000073c35d0 NIP .klist_add_tail+0x3c/0x110 LR .bus_add_driver+0x148/0x290 Call Trace: 0xc0000000073c35d0 (unreliable) .bus_add_driver+0x148/0x290 .driver_register+0xb8/0x190 .__hid_register_driver+0x70/0xd0 .redragon_driver_init+0x34/0x58 .do_one_initcall+0x130/0x3b0 .do_initcall_level+0xd8/0x188 .do_initcalls+0x7c/0xdc .kernel_init_freeable+0x178/0x21c .kernel_init+0x34/0x220 .ret_from_kernel_thread+0x58/0x60 Instruction dump: fba10078 7c7d1b78 38600001 fb810070 3b9d0008 fbc10080 7c9e2378 389d0018 fb9d0008 fb9d0010 90640000 fbdd0000 <0b1e0000> e87e0018 28230000 41820024 The instruction dump shows that we are trapping because r30 is not zero: tdnei r30,0 Where r30 = c000000007de72c8 The WARN_ON() comes from: static void knode_set_klist(struct klist_node *knode, struct klist *klist) { knode->n_klist = klist; /* no knode deserves to start its life dead */ WARN_ON(knode_dead(knode)); ^^^^^^^^^^^^^^^^^ Where: #define KNODE_DEAD 1LU static bool knode_dead(struct klist_node *knode) { return (unsigned long)knode->n_klist & KNODE_DEAD; } The full disassembly shows that clang has not generated any code to apply the "& KNODE_DEAD" to the n_klist pointer, which is surprising. Nathan filed an LLVM bug [2], in which Eli Friedman explained that clang believes it is only passing a single bit to the asm (ie. a bool) and so the mask of bit 0 with 1 can be omitted, and suggested that if we want the full 64-bit value passed to the inline asm we should cast to a 64-bit type (or 32-bit on 32-bits). In fact we already do that for BUG_ENTRY(), which was added to fix a possibly similar bug in 2005 in commit 32818c2eb6b8 ("[PATCH] ppc64: Fix issue with gcc 4.0 compiled kernels"). So cast the value we pass to the inline asm to long. For GCC this appears to have no effect on code generation, other than causing sign extension in some cases. [1]: http://lore.kernel.org/r/YSa1O4fcX1nNKqN/@Ryzen-9-3900X.localdomain [2]: https://bugs.llvm.org/show_bug.cgi?id=51634 Fixes: 1e688dd2a3d6 ("powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto") Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210901112522.1085134-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/bug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 1ee0f22313ee..02c08d1492f8 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -119,7 +119,8 @@ __label_warn_on: \ \ WARN_ENTRY(PPC_TLNEI " %4, 0", \ BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ - __label_warn_on, "r" (x)); \ + __label_warn_on, \ + "r" ((__force long)(x))); \ break; \ __label_warn_on: \ __ret_warn_on = true; \ From bea6a94a279bcbe6b2cde348782b28baf12255a5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 2 Sep 2021 09:19:51 +0200 Subject: [PATCH 298/474] MIPS: Malta: fix alignment of the devicetree buffer Starting with following patch MIPS Malta is not able to boot: | commit 79edff12060fe7772af08607eff50c0e2486c5ba | Author: Rob Herring | scripts/dtc: Update to upstream version v1.6.0-51-g183df9e9c2b9 The reason is the alignment test added to the fdt_ro_probe_(). To fix this issue, we need to make sure that fdt_buf is aligned. Since the dtc patch was designed to uncover potential issue, I handle initial MIPS Malta patch as initial bug. Fixes: e81a8c7dabac ("MIPS: Malta: Setup RAM regions via DT") Signed-off-by: Oleksij Rempel Signed-off-by: Thomas Bogendoerfer --- arch/mips/mti-malta/malta-dtshim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/mti-malta/malta-dtshim.c b/arch/mips/mti-malta/malta-dtshim.c index 0ddf03df6268..f451268f6c38 100644 --- a/arch/mips/mti-malta/malta-dtshim.c +++ b/arch/mips/mti-malta/malta-dtshim.c @@ -22,7 +22,7 @@ #define ROCIT_CONFIG_GEN1_MEMMAP_SHIFT 8 #define ROCIT_CONFIG_GEN1_MEMMAP_MASK (0xf << 8) -static unsigned char fdt_buf[16 << 10] __initdata; +static unsigned char fdt_buf[16 << 10] __initdata __aligned(8); /* determined physical memory size, not overridden by command line args */ extern unsigned long physical_memsize; From 799206c1302e8fabfab5a4151e74a2fe90090590 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 1 Sep 2021 20:22:11 -0400 Subject: [PATCH 299/474] iscsi_ibft: Fix isa_bus_to_virt not working under ARM The isa_bus_to_virt is only needed under X86 and in fact the code that sets the ibft_phys_addr is only compiled under X86. As such lets just ifdef the code. Reported-by: kernel test robot Reported-by: Vijayendra Suman CC: Maurizio Lombardi CC: Mike Rapoport Signed-off-by: Konrad Rzeszutek Wilk --- v2: Remove the ibft_phys_addr as it is defined in iscsi_ibft.h --- drivers/firmware/iscsi_ibft.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index 612a59e213df..6e9788324fea 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -86,10 +86,6 @@ MODULE_VERSION(IBFT_ISCSI_VERSION); static struct acpi_table_ibft *ibft_addr; -#ifndef CONFIG_ISCSI_IBFT_FIND -phys_addr_t ibft_phys_addr; -#endif - struct ibft_hdr { u8 id; u8 version; @@ -851,7 +847,21 @@ static void __init acpi_find_ibft_region(void) { } #endif - +#ifdef CONFIG_ISCSI_IBFT_FIND +static int __init acpi_find_isa_region(void) +{ + if (ibft_phys_addr) { + ibft_addr = isa_bus_to_virt(ibft_phys_addr); + return 0; + } + return -ENODEV; +} +#else +static int __init acpi_find_isa_region(void) +{ + return -ENODEV; +} +#endif /* * ibft_init() - creates sysfs tree entries for the iBFT data. */ @@ -864,9 +874,7 @@ static int __init ibft_init(void) is called before ACPI tables are parsed and it only does legacy finding. */ - if (ibft_phys_addr) - ibft_addr = isa_bus_to_virt(ibft_phys_addr); - else + if (acpi_find_isa_region()) acpi_find_ibft_region(); if (ibft_addr) { From e5a2cac908df691f1637f9272d4c6dec83239611 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 2 Sep 2021 23:54:03 +0200 Subject: [PATCH 300/474] parisc: Drop __arch_swab16(), arch_swab24(), _arch_swab32() and __arch_swab64() functions No need to keep those as inline assembly functions, the compiler now generates the same or even better optimized code. Signed-off-by: Helge Deller --- arch/parisc/include/uapi/asm/swab.h | 68 ----------------------------- 1 file changed, 68 deletions(-) delete mode 100644 arch/parisc/include/uapi/asm/swab.h diff --git a/arch/parisc/include/uapi/asm/swab.h b/arch/parisc/include/uapi/asm/swab.h deleted file mode 100644 index 35fb2d1bfbbd..000000000000 --- a/arch/parisc/include/uapi/asm/swab.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _PARISC_SWAB_H -#define _PARISC_SWAB_H - -#include -#include -#include - -#define __SWAB_64_THRU_32__ - -static inline __attribute_const__ __u16 __arch_swab16(__u16 x) -{ - __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ - "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ - : "=r" (x) - : "0" (x)); - return x; -} -#define __arch_swab16 __arch_swab16 - -static inline __attribute_const__ __u32 __arch_swab24(__u32 x) -{ - __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ - "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ - "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static inline __attribute_const__ __u32 __arch_swab32(__u32 x) -{ - unsigned int temp; - __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ - "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ - "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch_swab32 __arch_swab32 - -#if __BITS_PER_LONG > 32 -/* -** From "PA-RISC 2.0 Architecture", HP Professional Books. -** See Appendix I page 8 , "Endian Byte Swapping". -** -** Pretty cool algorithm: (* == zero'd bits) -** PERMH 01234567 -> 67452301 into %0 -** HSHL 67452301 -> 7*5*3*1* into %1 -** HSHR 67452301 -> *6*4*2*0 into %0 -** OR %0 | %1 -> 76543210 into %0 (all done!) -*/ -static inline __attribute_const__ __u64 __arch_swab64(__u64 x) -{ - __u64 temp; - __asm__("permh,3210 %0, %0\n\t" - "hshl %0, 8, %1\n\t" - "hshr,u %0, 8, %0\n\t" - "or %1, %0, %0" - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch_swab64 __arch_swab64 -#endif /* __BITS_PER_LONG > 32 */ - -#endif /* _PARISC_SWAB_H */ From c42813b71a06a2ff4a155aa87ac609feeab76cf3 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 2 Sep 2021 23:24:42 +0200 Subject: [PATCH 301/474] parisc: Fix unaligned-access crash in bootloader Kernel v5.14 has various changes to optimize unaligned memory accesses, e.g. commit 0652035a5794 ("asm-generic: unaligned: remove byteshift helpers"). Those changes triggered an unalignment-exception and thus crashed the bootloader on parisc because the unaligned "output_len" variable now suddenly was read word-wise while it was read byte-wise in the past. Fix this issue by declaring the external output_len variable as char which then forces the compiler to generate byte-accesses. Signed-off-by: Helge Deller Cc: Arnd Bergmann Cc: John David Anglin Bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102162 Fixes: 8c031ba63f8f ("parisc: Unbreak bootloader due to gcc-7 optimizations") Fixes: 0652035a5794 ("asm-generic: unaligned: remove byteshift helpers") Cc: # v5.14+ --- arch/parisc/boot/compressed/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/boot/compressed/misc.c b/arch/parisc/boot/compressed/misc.c index 2d395998f524..7ee49f5881d1 100644 --- a/arch/parisc/boot/compressed/misc.c +++ b/arch/parisc/boot/compressed/misc.c @@ -26,7 +26,7 @@ extern char input_data[]; extern int input_len; /* output_len is inserted by the linker possibly at an unaligned address */ -extern __le32 output_len __aligned(1); +extern char output_len; extern char _text, _end; extern char _bss, _ebss; extern char _startcode_end; From 577706de69c1e53bc23893953770d829a2242c38 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 2 Sep 2021 14:49:58 -0700 Subject: [PATCH 302/474] ia64: fix typo in a comment s/when when/when/ Link: https://lkml.kernel.org/r/20210817112500.12848-1-wangborong@cdjrlc.com Signed-off-by: Jason Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index e2af6b172200..96d13cb7c19f 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -906,6 +906,6 @@ EXPORT_SYMBOL(acpi_unregister_ioapic); /* * acpi_suspend_lowlevel() - save kernel state and suspend. * - * TBD when when IA64 starts to support suspend... + * TBD when IA64 starts to support suspend... */ int acpi_suspend_lowlevel(void) { return 0; } From 1d1f4bf845d36d73b27ed37790cae0da3112ca77 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 2 Sep 2021 14:50:01 -0700 Subject: [PATCH 303/474] ia64: fix #endif comment for reserve_elfcorehdr() Patch series "ia64: Miscellaneous fixes and cleanups". This patch series contains some miscellaneous fixes and cleanups for ia64. The second patch fixes a naming conflict triggered by a patch for the FDT code. This patch (of 3): The definition of reserve_elfcorehdr() depends on CONFIG_CRASH_DUMP, not CONFIG_PROC_VMCORE. Link: https://lkml.kernel.org/r/cover.1629884459.git.geert+renesas@glider.be Link: https://lkml.kernel.org/r/77b4c0648f200cab7e1c2c5171c06763e09362aa.1629884459.git.geert+renesas@glider.be Fixes: d9a9855d0b06ca6d ("always reserve elfcore header memory in crash kernel") Fixes: 17c1f07ed70afa4f ("[IA64] Reserve elfcorehdr memory in CONFIG_CRASH_DUMP") Signed-off-by: Geert Uytterhoeven Cc: Simon Horman Cc: Tony Luck Cc: Jay Lan Cc: Magnus Damm Cc: Mike Rapoport Cc: Rob Herring Cc: Frank Rowand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index dd595fbd8006..fbd931f1eb27 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -546,7 +546,7 @@ int __init reserve_elfcorehdr(u64 *start, u64 *end) return 0; } -#endif /* CONFIG_PROC_VMCORE */ +#endif /* CONFIG_CRASH_DUMP */ void __init setup_arch (char **cmdline_p) From 70b2e9912a018e9fddb3a1e0cbb397d4d3c0e98f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 2 Sep 2021 14:50:04 -0700 Subject: [PATCH 304/474] ia64: make reserve_elfcorehdr() static There never was a reason for reserve_elfcorehdr() to be global. Make the function static, and move it before its sole caller. Link: https://lkml.kernel.org/r/fe236cd73b64abc4abd03dd808cb015c907f4c8c.1629884459.git.geert+renesas@glider.be Fixes: cee87af2a5f75713 ("[IA64] kexec: Use EFI_LOADER_DATA for ELF core header") Signed-off-by: Geert Uytterhoeven Cc: Frank Rowand Cc: Jay Lan Cc: Magnus Damm Cc: Mike Rapoport Cc: Rob Herring Cc: Simon Horman Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/meminit.h | 1 - arch/ia64/kernel/setup.c | 51 ++++++++++++++++----------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h index 6c47a239fc26..2738f62b5f98 100644 --- a/arch/ia64/include/asm/meminit.h +++ b/arch/ia64/include/asm/meminit.h @@ -40,7 +40,6 @@ extern unsigned long efi_memmap_init(u64 *s, u64 *e); extern int find_max_min_low_pfn (u64, u64, void *); extern unsigned long vmcore_find_descriptor_size(unsigned long address); -extern int reserve_elfcorehdr(u64 *start, u64 *end); /* * For rounding an address to the next IA64_GRANULE_SIZE or order diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index fbd931f1eb27..5e6ee8939092 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -325,6 +325,31 @@ static inline void __init setup_crashkernel(unsigned long total, int *n) {} #endif +#ifdef CONFIG_CRASH_DUMP +static int __init reserve_elfcorehdr(u64 *start, u64 *end) +{ + u64 length; + + /* We get the address using the kernel command line, + * but the size is extracted from the EFI tables. + * Both address and size are required for reservation + * to work properly. + */ + + if (!is_vmcore_usable()) + return -EINVAL; + + if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { + vmcore_unusable(); + return -EINVAL; + } + + *start = (unsigned long)__va(elfcorehdr_addr); + *end = *start + length; + return 0; +} +#endif /* CONFIG_CRASH_DUMP */ + /** * reserve_memory - setup reserved memory areas * @@ -522,32 +547,6 @@ static __init int setup_nomca(char *s) } early_param("nomca", setup_nomca); -#ifdef CONFIG_CRASH_DUMP -int __init reserve_elfcorehdr(u64 *start, u64 *end) -{ - u64 length; - - /* We get the address using the kernel command line, - * but the size is extracted from the EFI tables. - * Both address and size are required for reservation - * to work properly. - */ - - if (!is_vmcore_usable()) - return -EINVAL; - - if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { - vmcore_unusable(); - return -EINVAL; - } - - *start = (unsigned long)__va(elfcorehdr_addr); - *end = *start + length; - return 0; -} - -#endif /* CONFIG_CRASH_DUMP */ - void __init setup_arch (char **cmdline_p) { From 7e4265c88968d4e53b164944c05e12e79d8ff9c6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 2 Sep 2021 14:50:08 -0700 Subject: [PATCH 305/474] ia64: make num_rsvd_regions static Commit f62800992e5917f2 ("ia64: switch to NO_BOOTMEM") removed the last user of num_rsvd_regions outside arch/ia64/kernel/setup.c. Link: https://lkml.kernel.org/r/a377b5437e3e9da93d02f996fe06a2b956cb0990.1629884459.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Cc: Frank Rowand Cc: Jay Lan Cc: Magnus Damm Cc: Mike Rapoport Cc: Rob Herring Cc: Simon Horman Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/meminit.h | 1 - arch/ia64/kernel/setup.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/ia64/include/asm/meminit.h b/arch/ia64/include/asm/meminit.h index 2738f62b5f98..f1d5bf2ba847 100644 --- a/arch/ia64/include/asm/meminit.h +++ b/arch/ia64/include/asm/meminit.h @@ -29,7 +29,6 @@ struct rsvd_region { }; extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; -extern int num_rsvd_regions; extern void find_memory (void); extern void reserve_memory (void); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 5e6ee8939092..31fb84de2d21 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -131,7 +131,7 @@ unsigned long ia64_cache_stride_shift = ~0; * We use a special marker for the end of memory and it uses the extra (+1) slot */ struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata; -int num_rsvd_regions __initdata; +static int num_rsvd_regions __initdata; /* From 2f566394467c86c3c89022013a99aa2d9d4a0208 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Sep 2021 14:50:11 -0700 Subject: [PATCH 306/474] ocfs2: remove an unnecessary condition The case where "tmp_oh" is NULL is handled at the start of the function. At this point we know it's non-NULL so this will always return 1. Link: https://lkml.kernel.org/r/YOcItgIXtisi3MaO@mwanda Signed-off-by: Dan Carpenter Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Larry Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 48fd369c29a4..33fbdc823278 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2721,7 +2721,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode, return status; } } - return tmp_oh ? 1 : 0; + return 1; } void ocfs2_inode_unlock_tracker(struct inode *inode, From 6c85c2c728193d19d6a908ae9fb312d0325e65ca Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Thu, 2 Sep 2021 14:50:14 -0700 Subject: [PATCH 307/474] ocfs2: quota_local: fix possible uninitialized-variable access in ocfs2_local_read_info() A memory block is allocated through kmalloc(), and its return value is assigned to the pointer oinfo. However, oinfo->dqi_gqinode is not initialized but it is accessed in: iput(oinfo->dqi_gqinode); To fix this possible uninitialized-variable access, assign NULL to oinfo->dqi_gqinode, and add ocfs2_qinfo_lock_res_init() behind the assignment in ocfs2_local_read_info(). Remove ocfs2_qinfo_lock_res_init() in ocfs2_global_read_info(). Link: https://lkml.kernel.org/r/20210804031832.57154-1-islituo@gmail.com Signed-off-by: Tuo Li Reported-by: TOTE Robot Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/quota_global.c | 1 - fs/ocfs2/quota_local.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index eda83487c9ec..f033de733adb 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) } oinfo->dqi_gi.dqi_sb = sb; oinfo->dqi_gi.dqi_type = type; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; oinfo->dqi_gqi_bh = NULL; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..0e4b16d4c037 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_gqinode = NULL; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; From 9673e0050c39b0534d0e2ca431223f52089f4959 Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 2 Sep 2021 14:50:17 -0700 Subject: [PATCH 308/474] ocfs2: ocfs2_downconvert_lock failure results in deadlock Usually, ocfs2_downconvert_lock() function always downconverts dlm lock to the expected level for satisfy dlm bast requests from the other nodes. But there is a rare situation. When dlm lock conversion is being canceled, ocfs2_downconvert_lock() function will return -EBUSY. You need to be aware that ocfs2_cancel_convert() function is asynchronous in fsdlm implementation. If we does not requeue this lockres entry, ocfs2 downconvert thread no longer handles this dlm lock bast request. Then, the other nodes will not get the dlm lock again, the current node's process will be blocked when acquire this dlm lock again. Link: https://lkml.kernel.org/r/20210830044621.12544-1-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 33fbdc823278..359524b7341f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -3912,6 +3913,17 @@ downconvert: spin_unlock_irqrestore(&lockres->l_lock, flags); ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, gen); + /* The dlm lock convert is being cancelled in background, + * ocfs2_cancel_convert() is asynchronous in fs/dlm, + * requeue it, try again later. + */ + if (ret == -EBUSY) { + ctl->requeue = 1; + mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n", + lockres->l_name); + ret = 0; + msleep(20); + } leave: if (ret) From 4bdffd2708d65e68ff254d90793bb167d828219f Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Thu, 2 Sep 2021 14:50:20 -0700 Subject: [PATCH 309/474] arch/csky/kernel/probes/kprobes.c: fix bugon.cocci warnings Use BUG_ON instead of a if condition followed by BUG. Generated by: scripts/coccinelle/misc/bugon.cocci Link: https://lkml.kernel.org/r/alpine.DEB.2.22.394.2107061049150.7197@hadrien Fixes: 7d37cb2c912d ("lib: fix kconfig dependency on ARCH_WANT_FRAME_POINTERS") Signed-off-by: kernel test robot Signed-off-by: Julia Lawall Reported-by: kernel test robot Cc: Julian Braha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/csky/kernel/probes/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 68b22b499aeb..8fffa34d4e1c 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) * normal page fault. */ regs->pc = (unsigned long) cur->addr; - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); From 3c9b84f044a9e54cf56d1b2c9b80a2d2ce56d70a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:19 -0700 Subject: [PATCH 310/474] mm/debug_vm_pgtable: introduce struct pgtable_debug_args Patch series "mm/debug_vm_pgtable: Enhancements", v6. There are a couple of issues with current implementations and this series tries to resolve the issues: (a) All needed information are scattered in variables, passed to various test functions. The code is organized in pretty much relaxed fashion. (b) The page isn't allocated from buddy during page table entry modifying tests. The page can be invalid, conflicting to the implementations of set_xxx_at() on ARM64. The target page is accessed so that the iCache can be flushed when execution permission is given on ARM64. Besides, the target page can be unmapped and accessing to it causes kernel crash. "struct pgtable_debug_args" is introduced to address issue (a). For issue (b), the used page is allocated from buddy in page table entry modifying tests. The corresponding tets will be skipped if we fail to allocate the (huge) page. For other test cases, the original page around to kernel symbol (@start_kernel) is still used. The patches are organized as below. PATCH[2-10] could be combined to one patch, but it will make the review harder: PATCH[1] introduces "struct pgtable_debug_args" as place holder of all needed information. With it, the old and new implementation can coexist. PATCH[2-10] uses "struct pgtable_debug_args" in various test functions. PATCH[11] removes the unused code for old implementation. PATCH[12] fixes the issue of corrupted page flag for ARM64 This patch (of 6): In debug_vm_pgtable(), there are many local variables introduced to track the needed information and they are passed to the functions for various test cases. It'd better to introduce a struct as place holder for these information. With it, what the tests functions need is the struct. In this way, the code is simplified and easier to be maintained. Besides, set_xxx_at() could access the data on the corresponding pages in the page table modifying tests. So the accessed pages in the tests should have been allocated from buddy. Otherwise, we're accessing pages that aren't owned by us. This causes issues like page flag corruption or kernel crash on accessing unmapped page when CONFIG_DEBUG_PAGEALLOC is enabled. This introduces "struct pgtable_debug_args". The struct is initialized and destroyed, but the information in the struct isn't used yet. It will be used in subsequent patches. Link: https://lkml.kernel.org/r/20210809092631.1888748-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20210809092631.1888748-2-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Anshuman Khandual Cc: Aneesh Kumar K.V Cc: Qian Cai Cc: Catalin Marinas Cc: Will Deacon Cc: Vineet Gupta Cc: Chunyu Hu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 270 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 269 insertions(+), 1 deletion(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1c922691aa61..7b6bcf59e376 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -58,6 +58,37 @@ #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) +struct pgtable_debug_args { + struct mm_struct *mm; + struct vm_area_struct *vma; + + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + p4d_t *start_p4dp; + pud_t *start_pudp; + pmd_t *start_pmdp; + pgtable_t start_ptep; + + unsigned long vaddr; + pgprot_t page_prot; + pgprot_t page_prot_none; + + bool is_contiguous_page; + unsigned long pud_pfn; + unsigned long pmd_pfn; + unsigned long pte_pfn; + + unsigned long fixed_pgd_pfn; + unsigned long fixed_p4d_pfn; + unsigned long fixed_pud_pfn; + unsigned long fixed_pmd_pfn; + unsigned long fixed_pte_pfn; +}; + static void __init pte_basic_tests(unsigned long pfn, int idx) { pgprot_t prot = protection_map[idx]; @@ -955,8 +986,239 @@ static unsigned long __init get_random_vaddr(void) return random_vaddr; } +static void __init destroy_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + + /* Free (huge) page */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage() && + args->pud_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pud_pfn, + (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); + } else { + page = pfn_to_page(args->pud_pfn); + __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); + } + + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage() && + args->pmd_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); + } else { + page = pfn_to_page(args->pmd_pfn); + __free_pages(page, HPAGE_PMD_ORDER); + } + + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (args->pte_pfn != ULONG_MAX) { + page = pfn_to_page(args->pte_pfn); + __free_pages(page, 0); + + args->pte_pfn = ULONG_MAX; + } + + /* Free page table entries */ + if (args->start_ptep) { + pte_free(args->mm, args->start_ptep); + mm_dec_nr_ptes(args->mm); + } + + if (args->start_pmdp) { + pmd_free(args->mm, args->start_pmdp); + mm_dec_nr_pmds(args->mm); + } + + if (args->start_pudp) { + pud_free(args->mm, args->start_pudp); + mm_dec_nr_puds(args->mm); + } + + if (args->start_p4dp) + p4d_free(args->mm, args->start_p4dp); + + /* Free vma and mm struct */ + if (args->vma) + vm_area_free(args->vma); + + if (args->mm) + mmdrop(args->mm); +} + +static struct page * __init +debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) +{ + struct page *page = NULL; + +#ifdef CONFIG_CONTIG_ALLOC + if (order >= MAX_ORDER) { + page = alloc_contig_pages((1 << order), GFP_KERNEL, + first_online_node, NULL); + if (page) { + args->is_contiguous_page = true; + return page; + } + } +#endif + + if (order < MAX_ORDER) + page = alloc_pages(GFP_KERNEL, order); + + return page; +} + +static int __init init_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + phys_addr_t phys; + int ret = 0; + + /* + * Initialize the debugging data. + * + * __P000 (or even __S000) will help create page table entries with + * PROT_NONE permission as required for pxx_protnone_tests(). + */ + memset(args, 0, sizeof(*args)); + args->vaddr = get_random_vaddr(); + args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot_none = __P000; + args->is_contiguous_page = false; + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + args->fixed_pgd_pfn = ULONG_MAX; + args->fixed_p4d_pfn = ULONG_MAX; + args->fixed_pud_pfn = ULONG_MAX; + args->fixed_pmd_pfn = ULONG_MAX; + args->fixed_pte_pfn = ULONG_MAX; + + /* Allocate mm and vma */ + args->mm = mm_alloc(); + if (!args->mm) { + pr_err("Failed to allocate mm struct\n"); + ret = -ENOMEM; + goto error; + } + + args->vma = vm_area_alloc(args->mm); + if (!args->vma) { + pr_err("Failed to allocate vma\n"); + ret = -ENOMEM; + goto error; + } + + /* + * Allocate page table entries. They will be modified in the tests. + * Lets save the page table entries so that they can be released + * when the tests are completed. + */ + args->pgdp = pgd_offset(args->mm, args->vaddr); + args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr); + if (!args->p4dp) { + pr_err("Failed to allocate p4d entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_p4dp = p4d_offset(args->pgdp, 0UL); + WARN_ON(!args->start_p4dp); + + args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr); + if (!args->pudp) { + pr_err("Failed to allocate pud entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pudp = pud_offset(args->p4dp, 0UL); + WARN_ON(!args->start_pudp); + + args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr); + if (!args->pmdp) { + pr_err("Failed to allocate pmd entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pmdp = pmd_offset(args->pudp, 0UL); + WARN_ON(!args->start_pmdp); + + if (pte_alloc(args->mm, args->pmdp)) { + pr_err("Failed to allocate pte entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + WARN_ON(!args->start_ptep); + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels in some of the tests. + */ + phys = __pa_symbol(&start_kernel); + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + + /* + * Allocate (huge) pages because some of the tests need to access + * the data in the pages. The corresponding tests will be skipped + * if we fail to allocate (huge) pages. + */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, + HPAGE_PUD_SHIFT - PAGE_SHIFT); + if (page) { + args->pud_pfn = page_to_pfn(page); + args->pmd_pfn = args->pud_pfn; + args->pte_pfn = args->pud_pfn; + return 0; + } + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER); + if (page) { + args->pmd_pfn = page_to_pfn(page); + args->pte_pfn = args->pmd_pfn; + return 0; + } + } + + page = alloc_pages(GFP_KERNEL, 0); + if (page) + args->pte_pfn = page_to_pfn(page); + + return 0; + +error: + destroy_args(args); + return ret; +} + static int __init debug_vm_pgtable(void) { + struct pgtable_debug_args args; struct vm_area_struct *vma; struct mm_struct *mm; pgd_t *pgdp; @@ -970,9 +1232,13 @@ static int __init debug_vm_pgtable(void) unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned, p4d_aligned, pgd_aligned; spinlock_t *ptl = NULL; - int idx; + int idx, ret; pr_info("Validating architecture page table helpers\n"); + ret = init_args(&args); + if (ret) + return ret; + prot = vm_get_page_prot(VMFLAGS); vaddr = get_random_vaddr(); mm = mm_alloc(); @@ -1127,6 +1393,8 @@ static int __init debug_vm_pgtable(void) mm_dec_nr_pmds(mm); mm_dec_nr_ptes(mm); mmdrop(mm); + + destroy_args(&args); return 0; } late_initcall(debug_vm_pgtable); From 36b77d1e159283da3c9414cbe6d9cb8e79a59c19 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:22 -0700 Subject: [PATCH 311/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in basic tests This uses struct pgtable_debug_args in the basic test functions. The unused variables @pgd_aligned and @p4d_aligned in debug_vm_pgtable() are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-3-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 50 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 7b6bcf59e376..64b5a76e0f6d 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -89,10 +89,10 @@ struct pgtable_debug_args { unsigned long fixed_pte_pfn; }; -static void __init pte_basic_tests(unsigned long pfn, int idx) +static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, prot); unsigned long val = idx, *ptr = &val; pr_debug("Validating PTE basic (%pGv)\n", ptr); @@ -174,7 +174,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_basic_tests(unsigned long pfn, int idx) +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -184,7 +184,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx) return; pr_debug("Validating PMD basic (%pGv)\n", ptr); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -296,7 +296,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { pgprot_t prot = protection_map[idx]; unsigned long val = idx, *ptr = &val; @@ -306,7 +306,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int return; pr_debug("Validating PUD basic (%pGv)\n", ptr); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, prot); /* * This test needs to be executed after the given page table entry @@ -327,7 +327,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; /* @@ -404,7 +404,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pud_leaf(pud)); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -414,8 +414,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm, static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_basic_tests(unsigned long pfn, int idx) { } -static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, @@ -476,7 +476,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init p4d_basic_tests(struct pgtable_debug_args *args) { p4d_t p4d; @@ -485,7 +485,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!p4d_same(p4d, p4d)); } -static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pgd_basic_tests(struct pgtable_debug_args *args) { pgd_t pgd; @@ -890,7 +890,7 @@ static void __init swap_migration_tests(void) } #ifdef CONFIG_HUGETLB_PAGE -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { struct page *page; pte_t pte; @@ -900,21 +900,21 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) * Accessing the page associated with the pfn is safe here, * as it was previously derived from a real kernel symbol. */ - page = pfn_to_page(pfn); - pte = mk_huge_pte(page, prot); + page = pfn_to_page(args->fixed_pmd_pfn); + pte = mk_huge_pte(page, args->page_prot); WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pte_huge(pte_mkhuge(pte))); #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ } #else /* !CONFIG_HUGETLB_PAGE */ -static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1230,7 +1230,7 @@ static int __init debug_vm_pgtable(void) pgprot_t prot, protnone; phys_addr_t paddr; unsigned long vaddr, pte_aligned, pmd_aligned; - unsigned long pud_aligned, p4d_aligned, pgd_aligned; + unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1273,8 +1273,6 @@ static int __init debug_vm_pgtable(void) pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT; - pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT; WARN_ON(!pfn_valid(pte_aligned)); pgdp = pgd_offset(mm, vaddr); @@ -1308,9 +1306,9 @@ static int __init debug_vm_pgtable(void) * given page table entry. */ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { - pte_basic_tests(pte_aligned, idx); - pmd_basic_tests(pmd_aligned, idx); - pud_basic_tests(mm, pud_aligned, idx); + pte_basic_tests(&args, idx); + pmd_basic_tests(&args, idx); + pud_basic_tests(&args, idx); } /* @@ -1320,8 +1318,8 @@ static int __init debug_vm_pgtable(void) * the above iteration for now to save some test execution * time. */ - p4d_basic_tests(p4d_aligned, prot); - pgd_basic_tests(pgd_aligned, prot); + p4d_basic_tests(&args); + pgd_basic_tests(&args); pmd_leaf_tests(pmd_aligned, prot); pud_leaf_tests(pud_aligned, prot); @@ -1350,7 +1348,7 @@ static int __init debug_vm_pgtable(void) pmd_thp_tests(pmd_aligned, prot); pud_thp_tests(pud_aligned, prot); - hugetlb_basic_tests(pte_aligned, prot); + hugetlb_basic_tests(&args); /* * Page table modifying tests. They need to hold From 8983d231c7cc1adaebed89153552da1e3fd55f61 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:25 -0700 Subject: [PATCH 312/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in leaf and savewrite tests This uses struct pgtable_debug_args in the leaf and savewrite test functions. Link: https://lkml.kernel.org/r/20210809092631.1888748-4-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 64b5a76e0f6d..759316143a21 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -161,9 +161,9 @@ static void __init pte_advanced_tests(struct mm_struct *mm, WARN_ON(pte_young(pte)); } -static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -262,7 +262,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pgtable = pgtable_trans_huge_withdraw(mm, pmdp); } -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -270,7 +270,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD leaf\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); /* * PMD based THP is a leaf entry. @@ -279,7 +279,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -290,7 +290,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); } @@ -388,7 +388,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pudp_huge_get_and_clear(mm, vaddr, pudp); } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -396,7 +396,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD leaf\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); /* * PUD based THP is a leaf entry. */ @@ -411,7 +411,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pgprot_t prot) { } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } @@ -428,9 +428,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pgprot_t prot) { } -static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1321,11 +1321,11 @@ static int __init debug_vm_pgtable(void) p4d_basic_tests(&args); pgd_basic_tests(&args); - pmd_leaf_tests(pmd_aligned, prot); - pud_leaf_tests(pud_aligned, prot); + pmd_leaf_tests(&args); + pud_leaf_tests(&args); - pte_savedwrite_tests(pte_aligned, protnone); - pmd_savedwrite_tests(pmd_aligned, protnone); + pte_savedwrite_tests(&args); + pmd_savedwrite_tests(&args); pte_special_tests(pte_aligned, prot); pte_protnone_tests(pte_aligned, protnone); From 8cb183f2f2a014e818cf60de3afd5a06410fd5b9 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:28 -0700 Subject: [PATCH 313/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in protnone and devmap tests This uses struct pgtable_debug_args in protnone and devmap test functions. After that, the unused variable @protnone in debug_vm_pgtable() is dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-5-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 58 +++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 759316143a21..8598aefeba4d 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -662,9 +662,9 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, WARN_ON(pmd_bad(pmd)); } -static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_special_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) return; @@ -673,9 +673,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot) WARN_ON(!pte_special(pte_mkspecial(pte))); } -static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_protnone_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; @@ -686,7 +686,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -697,25 +697,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD protnone\n"); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none)); WARN_ON(!pmd_protnone(pmd)); WARN_ON(!pmd_present(pmd)); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); pr_debug("Validating PTE devmap\n"); WARN_ON(!pte_devmap(pte_mkdevmap(pte))); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -723,12 +723,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD devmap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -736,20 +736,20 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD devmap\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_devmap(pud_mkdevmap(pud))); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else -static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) @@ -1227,7 +1227,7 @@ static int __init debug_vm_pgtable(void) pmd_t *pmdp, *saved_pmdp, pmd; pte_t *ptep; pgtable_t saved_ptep; - pgprot_t prot, protnone; + pgprot_t prot; phys_addr_t paddr; unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned; @@ -1247,12 +1247,6 @@ static int __init debug_vm_pgtable(void) return 1; } - /* - * __P000 (or even __S000) will help create page table entries with - * PROT_NONE permission as required for pxx_protnone_tests(). - */ - protnone = __P000; - vma = vm_area_alloc(mm); if (!vma) { pr_err("vma allocation failed\n"); @@ -1327,13 +1321,13 @@ static int __init debug_vm_pgtable(void) pte_savedwrite_tests(&args); pmd_savedwrite_tests(&args); - pte_special_tests(pte_aligned, prot); - pte_protnone_tests(pte_aligned, protnone); - pmd_protnone_tests(pmd_aligned, protnone); + pte_special_tests(&args); + pte_protnone_tests(&args); + pmd_protnone_tests(&args); - pte_devmap_tests(pte_aligned, prot); - pmd_devmap_tests(pmd_aligned, prot); - pud_devmap_tests(pud_aligned, prot); + pte_devmap_tests(&args); + pmd_devmap_tests(&args); + pud_devmap_tests(&args); pte_soft_dirty_tests(pte_aligned, prot); pmd_soft_dirty_tests(pmd_aligned, prot); From 5f447e8067fd9f472bc418de219f3cb9a8c3fbe8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:32 -0700 Subject: [PATCH 314/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in soft_dirty and swap tests This uses struct pgtable_debug_args in the soft_dirty and swap test functions. Link: https://lkml.kernel.org/r/20210809092631.1888748-6-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 48 +++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 8598aefeba4d..849081a2099d 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -752,9 +752,9 @@ static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ -static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -764,9 +764,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte))); } -static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; @@ -777,7 +777,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -788,12 +788,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd))); WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -805,31 +805,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap soft dirty\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) -{ -} +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pte_t pte; pr_debug("Validating PTE swap\n"); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); swp = __pte_to_swp_entry(pte); pte = __swp_entry_to_pte(swp); - WARN_ON(pfn != pte_pfn(pte)); + WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { swp_entry_t swp; pmd_t pmd; @@ -838,13 +836,13 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD swap\n"); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); swp = __pmd_to_swp_entry(pmd); pmd = __swp_entry_to_pmd(swp); - WARN_ON(pfn != pmd_pfn(pmd)); + WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); } #else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static void __init swap_migration_tests(void) @@ -1329,13 +1327,13 @@ static int __init debug_vm_pgtable(void) pmd_devmap_tests(&args); pud_devmap_tests(&args); - pte_soft_dirty_tests(pte_aligned, prot); - pmd_soft_dirty_tests(pmd_aligned, prot); - pte_swap_soft_dirty_tests(pte_aligned, prot); - pmd_swap_soft_dirty_tests(pmd_aligned, prot); + pte_soft_dirty_tests(&args); + pmd_soft_dirty_tests(&args); + pte_swap_soft_dirty_tests(&args); + pmd_swap_soft_dirty_tests(&args); - pte_swap_tests(pte_aligned, prot); - pmd_swap_tests(pmd_aligned, prot); + pte_swap_tests(&args); + pmd_swap_tests(&args); swap_migration_tests(); From 4878a888824bd69ad4fff18efa93901ba2ba24f3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:35 -0700 Subject: [PATCH 315/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in migration and thp tests This uses struct pgtable_debug_args in the migration and thp test functions. It's notable that the pre-allocated page is used in swap_migration_tests() as set_pte_at() is used there. Link: https://lkml.kernel.org/r/20210809092631.1888748-7-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 849081a2099d..6df86555b191 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -845,7 +845,7 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args) static void __init pmd_swap_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static void __init swap_migration_tests(void) +static void __init swap_migration_tests(struct pgtable_debug_args *args) { struct page *page; swp_entry_t swp; @@ -853,19 +853,18 @@ static void __init swap_migration_tests(void) if (!IS_ENABLED(CONFIG_MIGRATION)) return; - pr_debug("Validating swap migration\n"); /* * swap_migration_tests() requires a dedicated page as it needs to * be locked before creating a migration entry from it. Locking the * page that actually maps kernel text ('start_kernel') can be real - * problematic. Lets allocate a dedicated page explicitly for this - * purpose that will be freed subsequently. + * problematic. Lets use the allocated page explicitly for this + * purpose. */ - page = alloc_page(GFP_KERNEL); - if (!page) { - pr_err("page allocation failed\n"); + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; - } + + pr_debug("Validating swap migration\n"); /* * make_migration_entry() expects given page to be @@ -884,7 +883,6 @@ static void __init swap_migration_tests(void) WARN_ON(!is_migration_entry(swp)); WARN_ON(is_writable_migration_entry(swp)); __ClearPageLocked(page); - __free_page(page); } #ifdef CONFIG_HUGETLB_PAGE @@ -916,7 +914,7 @@ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -935,7 +933,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) * needs to return true. pmd_present() should be true whenever * pmd_trans_huge() returns true. */ - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd))); #ifndef __HAVE_ARCH_PMDP_INVALIDATE @@ -945,7 +943,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_thp_tests(struct pgtable_debug_args *args) { pud_t pud; @@ -953,7 +951,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD based THP\n"); - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); WARN_ON(!pud_trans_huge(pud_mkhuge(pud))); /* @@ -965,11 +963,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) */ } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static unsigned long __init get_random_vaddr(void) @@ -1335,10 +1333,10 @@ static int __init debug_vm_pgtable(void) pte_swap_tests(&args); pmd_swap_tests(&args); - swap_migration_tests(); + swap_migration_tests(&args); - pmd_thp_tests(pmd_aligned, prot); - pud_thp_tests(pud_aligned, prot); + pmd_thp_tests(&args); + pud_thp_tests(&args); hugetlb_basic_tests(&args); From 44966c4480f8024776c9ecc68f5589f023f19884 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:38 -0700 Subject: [PATCH 316/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in PTE modifying tests This uses struct pgtable_debug_args in PTE modifying tests. The allocated page is used as set_pte_at() is used there. The tests are skipped if the allocated page doesn't exist. It's notable that args->ptep need to be mapped before the tests. The reason why we don't map args->ptep at the beginning is PTE entry is only mapped and accessible in atomic context when CONFIG_HIGHPTE is enabled. So we avoid to do that so that atomic context is only enabled if needed. Besides, the unused variable @pte_aligned and @ptep in debug_vm_pgtable() are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-8-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 67 +++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 6df86555b191..652f26f5ecd6 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -117,10 +117,7 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } -static void __init pte_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_advanced_tests(struct pgtable_debug_args *args) { pte_t pte; @@ -129,35 +126,37 @@ static void __init pte_advanced_tests(struct mm_struct *mm, * This requires set_pte_at to be not used to update an * existing pte entry. Clear pte before we do set_pte_at */ + if (args->pte_pfn == ULONG_MAX) + return; pr_debug("Validating PTE advanced\n"); - pte = pfn_pte(pfn, prot); - set_pte_at(mm, vaddr, ptep, pte); - ptep_set_wrprotect(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte = pfn_pte(args->pte_pfn, args->page_prot); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_write(pte)); - ptep_get_and_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + ptep_get_and_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_wrprotect(pte); pte = pte_mkclean(pte); - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); - ptep_set_access_flags(vma, vaddr, ptep, pte, 1); - pte = ptep_get(ptep); + ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); + pte = ptep_get(args->ptep); WARN_ON(!(pte_write(pte) && pte_dirty(pte))); - ptep_get_and_clear_full(mm, vaddr, ptep, 1); - pte = ptep_get(ptep); + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); - pte = pfn_pte(pfn, prot); + pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_mkyoung(pte); - set_pte_at(mm, vaddr, ptep, pte); - ptep_test_and_clear_young(vma, vaddr, ptep); - pte = ptep_get(ptep); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); } @@ -618,20 +617,21 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, } #endif /* PAGETABLE_P4D_FOLDED */ -static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pte_clear_tests(struct pgtable_debug_args *args) { - pte_t pte = pfn_pte(pfn, prot); + pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); + + if (args->pte_pfn == ULONG_MAX) + return; pr_debug("Validating PTE clear\n"); #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif - set_pte_at(mm, vaddr, ptep, pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); barrier(); - pte_clear(mm, vaddr, ptep); - pte = ptep_get(ptep); + pte_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } @@ -1221,11 +1221,10 @@ static int __init debug_vm_pgtable(void) p4d_t *p4dp, *saved_p4dp; pud_t *pudp, *saved_pudp; pmd_t *pmdp, *saved_pmdp, pmd; - pte_t *ptep; pgtable_t saved_ptep; pgprot_t prot; phys_addr_t paddr; - unsigned long vaddr, pte_aligned, pmd_aligned; + unsigned long vaddr, pmd_aligned; unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1260,10 +1259,8 @@ static int __init debug_vm_pgtable(void) */ paddr = __pa_symbol(&start_kernel); - pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT; pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - WARN_ON(!pfn_valid(pte_aligned)); pgdp = pgd_offset(mm, vaddr); p4dp = p4d_alloc(mm, pgdp, vaddr); @@ -1345,10 +1342,10 @@ static int __init debug_vm_pgtable(void) * proper page table lock. */ - ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl); - pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot); - pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot); - pte_unmap_unlock(ptep, ptl); + args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl); + pte_clear_tests(&args); + pte_advanced_tests(&args); + pte_unmap_unlock(args.ptep, ptl); ptl = pmd_lock(mm, pmdp); pmd_clear_tests(mm, pmdp); From c0fe07b0aa72b530d5da63a24eb10d503cae5a95 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:41 -0700 Subject: [PATCH 317/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in PMD modifying tests This uses struct pgtable_debug_args in PMD modifying tests. The allocated huge page is used when set_pmd_at() is used. The corresponding tests are skipped if the huge page doesn't exist. Besides, the unused variable @pmd_aligned in debug_vm_pgtable() is dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-9-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 98 ++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 52 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 652f26f5ecd6..abf778f729fd 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -211,54 +211,55 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { pmd_t pmd; + unsigned long vaddr = args->vaddr; if (!has_transparent_hugepage()) return; + if (args->pmd_pfn == ULONG_MAX) + return; + pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ vaddr &= HPAGE_PMD_MASK; - pgtable_trans_huge_deposit(mm, pmdp, pgtable); + pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep); - pmd = pfn_pmd(pfn, prot); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_set_wrprotect(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_write(pmd)); - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pfn_pmd(pfn, prot); + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); pmd = pmd_mkwrite(pmd); pmd = pmd_mkdirty(pmd); - pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1); - pmd = READ_ONCE(*pmdp); + pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); - pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1); - pmd = READ_ONCE(*pmdp); + pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); - pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); pmd = pmd_mkyoung(pmd); - set_pmd_at(mm, vaddr, pmdp, pmd); - pmdp_test_and_clear_young(vma, vaddr, pmdp); - pmd = READ_ONCE(*pmdp); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_young(pmd)); /* Clear the pte entries */ - pmdp_huge_get_and_clear(mm, vaddr, pmdp); - pgtable = pgtable_trans_huge_withdraw(mm, pmdp); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pgtable_trans_huge_withdraw(args->mm, args->pmdp); } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) @@ -415,12 +416,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } -static void __init pmd_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pmd_t *pmdp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot, pgtable_t pgtable) -{ -} +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -433,11 +429,11 @@ static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { pmd_t pmd; - if (!arch_vmap_pmd_supported(prot)) + if (!arch_vmap_pmd_supported(args->page_prot)) return; pr_debug("Validating PMD huge\n"); @@ -445,10 +441,10 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) * X86 defined pmd_set_huge() verifies that the given * PMD is not a populated non-leaf entry. */ - WRITE_ONCE(*pmdp, __pmd(0)); - WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pmd_clear_huge(pmdp)); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, __pmd(0)); + WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); + WARN_ON(!pmd_clear_huge(args->pmdp)); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } @@ -471,7 +467,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ -static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { } +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ @@ -635,20 +631,19 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) WARN_ON(!pte_none(pte)); } -static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp) +static void __init pmd_clear_tests(struct pgtable_debug_args *args) { - pmd_t pmd = READ_ONCE(*pmdp); + pmd_t pmd = READ_ONCE(*args->pmdp); pr_debug("Validating PMD clear\n"); pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); - WRITE_ONCE(*pmdp, pmd); - pmd_clear(pmdp); - pmd = READ_ONCE(*pmdp); + WRITE_ONCE(*args->pmdp, pmd); + pmd_clear(args->pmdp); + pmd = READ_ONCE(*args->pmdp); WARN_ON(!pmd_none(pmd)); } -static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +static void __init pmd_populate_tests(struct pgtable_debug_args *args) { pmd_t pmd; @@ -657,8 +652,8 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp, * This entry points to next level page table page. * Hence this must not qualify as pmd_bad(). */ - pmd_populate(mm, pmdp, pgtable); - pmd = READ_ONCE(*pmdp); + pmd_populate(args->mm, args->pmdp, args->start_ptep); + pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_bad(pmd)); } @@ -1224,7 +1219,7 @@ static int __init debug_vm_pgtable(void) pgtable_t saved_ptep; pgprot_t prot; phys_addr_t paddr; - unsigned long vaddr, pmd_aligned; + unsigned long vaddr; unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1259,7 +1254,6 @@ static int __init debug_vm_pgtable(void) */ paddr = __pa_symbol(&start_kernel); - pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT; pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; pgdp = pgd_offset(mm, vaddr); @@ -1347,11 +1341,11 @@ static int __init debug_vm_pgtable(void) pte_advanced_tests(&args); pte_unmap_unlock(args.ptep, ptl); - ptl = pmd_lock(mm, pmdp); - pmd_clear_tests(mm, pmdp); - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep); - pmd_huge_tests(pmdp, pmd_aligned, prot); - pmd_populate_tests(mm, pmdp, saved_ptep); + ptl = pmd_lock(args.mm, args.pmdp); + pmd_clear_tests(&args); + pmd_advanced_tests(&args); + pmd_huge_tests(&args); + pmd_populate_tests(&args); spin_unlock(ptl); ptl = pud_lock(mm, pudp); From 4cbde03bdb0b832503999387f5e86b006fa54674 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:45 -0700 Subject: [PATCH 318/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in PUD modifying tests This uses struct pgtable_debug_args in PUD modifying tests. The allocated huge page is used when set_pud_at() is used. The corresponding tests are skipped if the huge page doesn't exist. Besides, the following unused variables in debug_vm_pgtable() are dropped: @prot, @paddr, @pud_aligned. Link: https://lkml.kernel.org/r/20210809092631.1888748-10-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 126 ++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index abf778f729fd..4e453ae0f291 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -337,55 +337,56 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) WARN_ON(!pud_bad(pud_mkhuge(pud))); } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { + unsigned long vaddr = args->vaddr; pud_t pud; if (!has_transparent_hugepage()) return; + if (args->pud_pfn == ULONG_MAX) + return; + pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; - pud = pfn_pud(pfn, prot); - set_pud_at(mm, vaddr, pudp, pud); - pudp_set_wrprotect(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pud = pfn_pud(args->pud_pfn, args->page_prot); + set_pud_at(args->mm, vaddr, args->pudp, pud); + pudp_set_wrprotect(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_write(pud)); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear(mm, vaddr, pudp); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_wrprotect(pud); pud = pud_mkclean(pud); - set_pud_at(mm, vaddr, pudp, pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); - pudp_set_access_flags(vma, vaddr, pudp, pud, 1); - pud = READ_ONCE(*pudp); + pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1); - pud = READ_ONCE(*pudp); + pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ - pud = pfn_pud(pfn, prot); + pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_mkyoung(pud); - set_pud_at(mm, vaddr, pudp, pud); - pudp_test_and_clear_young(vma, vaddr, pudp); - pud = READ_ONCE(*pudp); + set_pud_at(args->mm, vaddr, args->pudp, pud); + pudp_test_and_clear_young(args->vma, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_young(pud)); - pudp_huge_get_and_clear(mm, vaddr, pudp); + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); } static void __init pud_leaf_tests(struct pgtable_debug_args *args) @@ -405,24 +406,14 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } -static void __init pud_advanced_tests(struct mm_struct *mm, - struct vm_area_struct *vma, pud_t *pudp, - unsigned long pfn, unsigned long vaddr, - pgprot_t prot) -{ -} +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } @@ -448,11 +439,11 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_none(pmd)); } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) +static void __init pud_huge_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!arch_vmap_pud_supported(prot)) + if (!arch_vmap_pud_supported(args->page_prot)) return; pr_debug("Validating PUD huge\n"); @@ -460,15 +451,15 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) * X86 defined pud_set_huge() verifies that the given * PUD is not a populated non-leaf entry. */ - WRITE_ONCE(*pudp, __pud(0)); - WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot)); - WARN_ON(!pud_clear_huge(pudp)); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, __pud(0)); + WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); + WARN_ON(!pud_clear_huge(args->pudp)); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } -static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { } +static void __init pud_huge_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static void __init p4d_basic_tests(struct pgtable_debug_args *args) @@ -490,27 +481,26 @@ static void __init pgd_basic_tests(struct pgtable_debug_args *args) } #ifndef __PAGETABLE_PUD_FOLDED -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) +static void __init pud_clear_tests(struct pgtable_debug_args *args) { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = READ_ONCE(*args->pudp); - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD clear\n"); pud = __pud(pud_val(pud) | RANDOM_ORVALUE); - WRITE_ONCE(*pudp, pud); - pud_clear(pudp); - pud = READ_ONCE(*pudp); + WRITE_ONCE(*args->pudp, pud); + pud_clear(args->pudp); + pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) +static void __init pud_populate_tests(struct pgtable_debug_args *args) { pud_t pud; - if (mm_pmd_folded(mm)) + if (mm_pmd_folded(args->mm)) return; pr_debug("Validating PUD populate\n"); @@ -518,16 +508,13 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, * This entry points to next level page table page. * Hence this must not qualify as pud_bad(). */ - pud_populate(mm, pudp, pmdp); - pud = READ_ONCE(*pudp); + pud_populate(args->mm, args->pudp, args->start_pmdp); + pud = READ_ONCE(*args->pudp); WARN_ON(pud_bad(pud)); } #else /* !__PAGETABLE_PUD_FOLDED */ -static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { } -static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp, - pmd_t *pmdp) -{ -} +static void __init pud_clear_tests(struct pgtable_debug_args *args) { } +static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_P4D_FOLDED @@ -1217,10 +1204,7 @@ static int __init debug_vm_pgtable(void) pud_t *pudp, *saved_pudp; pmd_t *pmdp, *saved_pmdp, pmd; pgtable_t saved_ptep; - pgprot_t prot; - phys_addr_t paddr; unsigned long vaddr; - unsigned long pud_aligned; spinlock_t *ptl = NULL; int idx, ret; @@ -1229,7 +1213,6 @@ static int __init debug_vm_pgtable(void) if (ret) return ret; - prot = vm_get_page_prot(VMFLAGS); vaddr = get_random_vaddr(); mm = mm_alloc(); if (!mm) { @@ -1243,19 +1226,6 @@ static int __init debug_vm_pgtable(void) return 1; } - /* - * PFN for mapping at PTE level is determined from a standard kernel - * text symbol. But pfns for higher page table levels are derived by - * masking lower bits of this real pfn. These derived pfns might not - * exist on the platform but that does not really matter as pfn_pxx() - * helpers will still create appropriate entries for the test. This - * helps avoid large memory block allocations to be used for mapping - * at higher page table levels. - */ - paddr = __pa_symbol(&start_kernel); - - pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT; - pgdp = pgd_offset(mm, vaddr); p4dp = p4d_alloc(mm, pgdp, vaddr); pudp = pud_alloc(mm, p4dp, vaddr); @@ -1348,11 +1318,11 @@ static int __init debug_vm_pgtable(void) pmd_populate_tests(&args); spin_unlock(ptl); - ptl = pud_lock(mm, pudp); - pud_clear_tests(mm, pudp); - pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot); - pud_huge_tests(pudp, pud_aligned, prot); - pud_populate_tests(mm, pudp, saved_pmdp); + ptl = pud_lock(args.mm, args.pudp); + pud_clear_tests(&args); + pud_advanced_tests(&args); + pud_huge_tests(&args); + pud_populate_tests(&args); spin_unlock(ptl); spin_lock(&mm->page_table_lock); From 2f87f8c39a91effbbfd88a4642bd245b9c2b7ad3 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:48 -0700 Subject: [PATCH 319/474] mm/debug_vm_pgtable: use struct pgtable_debug_args in PGD and P4D modifying tests This uses struct pgtable_debug_args in PGD/P4D modifying tests. No allocated huge page is used in these tests. Besides, the unused variable @saved_p4dp and @saved_pudp are dropped. Link: https://lkml.kernel.org/r/20210809092631.1888748-11-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 86 +++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 48 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4e453ae0f291..9e932ae9f158 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -518,27 +518,26 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_P4D_FOLDED -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = READ_ONCE(*args->p4dp); - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D clear\n"); p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); - WRITE_ONCE(*p4dp, p4d); - p4d_clear(p4dp); - p4d = READ_ONCE(*p4dp); + WRITE_ONCE(*args->p4dp, p4d); + p4d_clear(args->p4dp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(!p4d_none(p4d)); } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { p4d_t p4d; - if (mm_pud_folded(mm)) + if (mm_pud_folded(args->mm)) return; pr_debug("Validating P4D populate\n"); @@ -546,34 +545,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, * This entry points to next level page table page. * Hence this must not qualify as p4d_bad(). */ - pud_clear(pudp); - p4d_clear(p4dp); - p4d_populate(mm, p4dp, pudp); - p4d = READ_ONCE(*p4dp); + pud_clear(args->pudp); + p4d_clear(args->p4dp); + p4d_populate(args->mm, args->p4dp, args->start_pudp); + p4d = READ_ONCE(*args->p4dp); WARN_ON(p4d_bad(p4d)); } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { - pgd_t pgd = READ_ONCE(*pgdp); + pgd_t pgd = READ_ONCE(*(args->pgdp)); - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD clear\n"); pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); - WRITE_ONCE(*pgdp, pgd); - pgd_clear(pgdp); - pgd = READ_ONCE(*pgdp); + WRITE_ONCE(*args->pgdp, pgd); + pgd_clear(args->pgdp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(!pgd_none(pgd)); } -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { pgd_t pgd; - if (mm_p4d_folded(mm)) + if (mm_p4d_folded(args->mm)) return; pr_debug("Validating PGD populate\n"); @@ -581,23 +579,17 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, * This entry points to next level page table page. * Hence this must not qualify as pgd_bad(). */ - p4d_clear(p4dp); - pgd_clear(pgdp); - pgd_populate(mm, pgdp, p4dp); - pgd = READ_ONCE(*pgdp); + p4d_clear(args->p4dp); + pgd_clear(args->pgdp); + pgd_populate(args->mm, args->pgdp, args->start_p4dp); + pgd = READ_ONCE(*args->pgdp); WARN_ON(pgd_bad(pgd)); } #else /* !__PAGETABLE_P4D_FOLDED */ -static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { } -static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { } -static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp, - pud_t *pudp) -{ -} -static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, - p4d_t *p4dp) -{ -} +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { } +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { } +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { } +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } #endif /* PAGETABLE_P4D_FOLDED */ static void __init pte_clear_tests(struct pgtable_debug_args *args) @@ -1200,8 +1192,8 @@ static int __init debug_vm_pgtable(void) struct vm_area_struct *vma; struct mm_struct *mm; pgd_t *pgdp; - p4d_t *p4dp, *saved_p4dp; - pud_t *pudp, *saved_pudp; + p4d_t *p4dp; + pud_t *pudp; pmd_t *pmdp, *saved_pmdp, pmd; pgtable_t saved_ptep; unsigned long vaddr; @@ -1245,8 +1237,6 @@ static int __init debug_vm_pgtable(void) * page table pages. */ pmd = READ_ONCE(*pmdp); - saved_p4dp = p4d_offset(pgdp, 0UL); - saved_pudp = pud_offset(p4dp, 0UL); saved_pmdp = pmd_offset(pudp, 0UL); saved_ptep = pmd_pgtable(pmd); @@ -1325,15 +1315,15 @@ static int __init debug_vm_pgtable(void) pud_populate_tests(&args); spin_unlock(ptl); - spin_lock(&mm->page_table_lock); - p4d_clear_tests(mm, p4dp); - pgd_clear_tests(mm, pgdp); - p4d_populate_tests(mm, p4dp, saved_pudp); - pgd_populate_tests(mm, pgdp, saved_p4dp); - spin_unlock(&mm->page_table_lock); + spin_lock(&(args.mm->page_table_lock)); + p4d_clear_tests(&args); + pgd_clear_tests(&args); + p4d_populate_tests(&args); + pgd_populate_tests(&args); + spin_unlock(&(args.mm->page_table_lock)); - p4d_free(mm, saved_p4dp); - pud_free(mm, saved_pudp); + p4d_free(mm, p4d_offset(pgdp, 0UL)); + pud_free(mm, pud_offset(p4dp, 0UL)); pmd_free(mm, saved_pmdp); pte_free(mm, saved_ptep); From fda88cfda1ab666ee6c136eeb401b4ead7ecd066 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:51 -0700 Subject: [PATCH 320/474] mm/debug_vm_pgtable: remove unused code The variables used by old implementation isn't needed as we switched to "struct pgtable_debug_args". Lets remove them and related code in debug_vm_pgtable(). Link: https://lkml.kernel.org/r/20210809092631.1888748-12-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 54 ------------------------------------------- 1 file changed, 54 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 9e932ae9f158..4409e30adcf6 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1189,14 +1189,6 @@ error: static int __init debug_vm_pgtable(void) { struct pgtable_debug_args args; - struct vm_area_struct *vma; - struct mm_struct *mm; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp, *saved_pmdp, pmd; - pgtable_t saved_ptep; - unsigned long vaddr; spinlock_t *ptl = NULL; int idx, ret; @@ -1205,41 +1197,6 @@ static int __init debug_vm_pgtable(void) if (ret) return ret; - vaddr = get_random_vaddr(); - mm = mm_alloc(); - if (!mm) { - pr_err("mm_struct allocation failed\n"); - return 1; - } - - vma = vm_area_alloc(mm); - if (!vma) { - pr_err("vma allocation failed\n"); - return 1; - } - - pgdp = pgd_offset(mm, vaddr); - p4dp = p4d_alloc(mm, pgdp, vaddr); - pudp = pud_alloc(mm, p4dp, vaddr); - pmdp = pmd_alloc(mm, pudp, vaddr); - /* - * Allocate pgtable_t - */ - if (pte_alloc(mm, pmdp)) { - pr_err("pgtable allocation failed\n"); - return 1; - } - - /* - * Save all the page table page addresses as the page table - * entries will be used for testing with random or garbage - * values. These saved addresses will be used for freeing - * page table pages. - */ - pmd = READ_ONCE(*pmdp); - saved_pmdp = pmd_offset(pudp, 0UL); - saved_ptep = pmd_pgtable(pmd); - /* * Iterate over the protection_map[] to make sure that all * the basic page table transformation validations just hold @@ -1322,17 +1279,6 @@ static int __init debug_vm_pgtable(void) pgd_populate_tests(&args); spin_unlock(&(args.mm->page_table_lock)); - p4d_free(mm, p4d_offset(pgdp, 0UL)); - pud_free(mm, pud_offset(p4dp, 0UL)); - pmd_free(mm, saved_pmdp); - pte_free(mm, saved_ptep); - - vm_area_free(vma); - mm_dec_nr_puds(mm); - mm_dec_nr_pmds(mm); - mm_dec_nr_ptes(mm); - mmdrop(mm); - destroy_args(&args); return 0; } From 8c5b3a8adad2152d162fc0230c822f907c816be9 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 2 Sep 2021 14:52:54 -0700 Subject: [PATCH 321/474] mm/debug_vm_pgtable: fix corrupted page flag In page table entry modifying tests, set_xxx_at() are used to populate the page table entries. On ARM64, PG_arch_1 (PG_dcache_clean) flag is set to the target page flag if execution permission is given. The logic exits since commit 4f04d8f00545 ("arm64: MMU definitions"). The page flag is kept when the page is free'd to buddy's free area list. However, it will trigger page checking failure when it's pulled from the buddy's free area list, as the following warning messages indicate. BUG: Bad page state in process memhog pfn:08000 page:0000000015c0a628 refcount:0 mapcount:0 \ mapping:0000000000000000 index:0x1 pfn:0x8000 flags: 0x7ffff8000000800(arch_1|node=0|zone=0|lastcpupid=0xfffff) raw: 07ffff8000000800 dead000000000100 dead000000000122 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_PREP flag(s) set This fixes the issue by clearing PG_arch_1 through flush_dcache_page() after set_xxx_at() is called. For architectures other than ARM64, the unexpected overhead of cache flushing is acceptable. Link: https://lkml.kernel.org/r/20210809092631.1888748-13-gshan@redhat.com Fixes: a5c3b9ffb0f4 ("mm/debug_vm_pgtable: add tests validating advanced arch page table helpers") Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Tested-by: Christophe Leroy [powerpc 8xx] Tested-by: Gerald Schaefer [s390] Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Chunyu Hu Cc: Qian Cai Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 55 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 4409e30adcf6..1403639302e4 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -29,6 +29,8 @@ #include #include #include + +#include #include #include @@ -119,19 +121,28 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pte_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pte_t pte; /* * Architectures optimize set_pte_at by avoiding TLB flush. * This requires set_pte_at to be not used to update an * existing pte entry. Clear pte before we do set_pte_at + * + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. */ - if (args->pte_pfn == ULONG_MAX) + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; pr_debug("Validating PTE advanced\n"); pte = pfn_pte(args->pte_pfn, args->page_prot); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_write(pte)); @@ -143,6 +154,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); pte = pte_mkwrite(pte); pte = pte_mkdirty(pte); ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); @@ -155,6 +167,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) pte = pfn_pte(args->pte_pfn, args->page_prot); pte = pte_mkyoung(pte); set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(pte_young(pte)); @@ -213,15 +226,24 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; pmd_t pmd; unsigned long vaddr = args->vaddr; if (!has_transparent_hugepage()) return; - if (args->pmd_pfn == ULONG_MAX) + page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pmd_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ vaddr &= HPAGE_PMD_MASK; @@ -230,6 +252,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pfn_pmd(args->pmd_pfn, args->page_prot); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_write(pmd)); @@ -241,6 +264,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmd = pmd_mkwrite(pmd); pmd = pmd_mkdirty(pmd); pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); @@ -253,6 +277,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); pmd = pmd_mkyoung(pmd); set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); pmd = READ_ONCE(*args->pmdp); WARN_ON(pmd_young(pmd)); @@ -339,21 +364,31 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) static void __init pud_advanced_tests(struct pgtable_debug_args *args) { + struct page *page; unsigned long vaddr = args->vaddr; pud_t pud; if (!has_transparent_hugepage()) return; - if (args->pud_pfn == ULONG_MAX) + page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pud_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; pud = pfn_pud(args->pud_pfn, args->page_prot); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pudp_set_wrprotect(args->mm, vaddr, args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(pud_write(pud)); @@ -367,6 +402,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pud_wrprotect(pud); pud = pud_mkclean(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pud = pud_mkwrite(pud); pud = pud_mkdirty(pud); pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); @@ -382,6 +418,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) pud = pfn_pud(args->pud_pfn, args->page_prot); pud = pud_mkyoung(pud); set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); pudp_test_and_clear_young(args->vma, vaddr, args->pudp); pud = READ_ONCE(*args->pudp); WARN_ON(pud_young(pud)); @@ -594,16 +631,26 @@ static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } static void __init pte_clear_tests(struct pgtable_debug_args *args) { + struct page *page; pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); - if (args->pte_pfn == ULONG_MAX) + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) return; + /* + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ pr_debug("Validating PTE clear\n"); #ifndef CONFIG_RISCV pte = __pte(pte_val(pte) | RANDOM_ORVALUE); #endif set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); barrier(); pte_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); From 4f3eaf452a14ff3982f71c1ca8bdf757254231fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 2 Sep 2021 14:52:58 -0700 Subject: [PATCH 322/474] mm: report a more useful address for reclaim acquisition A recent lockdep report included these lines: [ 96.177910] 3 locks held by containerd/770: [ 96.177934] #0: ffff88810815ea28 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x115/0x770 [ 96.177999] #1: ffffffff82915020 (rcu_read_lock){....}-{1:2}, at: get_swap_device+0x33/0x140 [ 96.178057] #2: ffffffff82955ba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 While it was not useful to that bug report to know where the reclaim lock had been acquired, it might be useful under other circumstances. Allow the caller of __fs_reclaim_acquire to specify the instruction pointer to use. Link: https://lkml.kernel.org/r/20210719185709.1755149-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Boqun Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 8 ++++---- mm/page_alloc.c | 12 ++++++------ mm/vmscan.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index e24b1fe348e3..8894825cc4db 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP -extern void __fs_reclaim_acquire(void); -extern void __fs_reclaim_release(void); +extern void __fs_reclaim_acquire(unsigned long ip); +extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else -static inline void __fs_reclaim_acquire(void) { } -static inline void __fs_reclaim_release(void) { } +static inline void __fs_reclaim_acquire(unsigned long ip) { } +static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eeb3a9cb36bb..51c17bf7b127 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask) return true; } -void __fs_reclaim_acquire(void) +void __fs_reclaim_acquire(unsigned long ip) { - lock_map_acquire(&__fs_reclaim_map); + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); } -void __fs_reclaim_release(void) +void __fs_reclaim_release(unsigned long ip) { - lock_map_release(&__fs_reclaim_map); + lock_release(&__fs_reclaim_map, ip); } void fs_reclaim_acquire(gfp_t gfp_mask) @@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_RET_IP_); #ifdef CONFIG_MMU_NOTIFIER lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); @@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask) if (__need_reclaim(gfp_mask)) { if (gfp_mask & __GFP_FS) - __fs_reclaim_release(); + __fs_reclaim_release(_RET_IP_); } } EXPORT_SYMBOL_GPL(fs_reclaim_release); diff --git a/mm/vmscan.c b/mm/vmscan.c index eeae2f6bc532..17c4b3fdd7dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3812,7 +3812,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); count_vm_event(PAGEOUTRUN); @@ -3938,9 +3938,9 @@ restart: wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); ret = try_to_freeze(); - __fs_reclaim_acquire(); + __fs_reclaim_acquire(_THIS_IP_); if (ret || kthread_should_stop()) break; @@ -3992,7 +3992,7 @@ out: } snapshot_refaults(NULL, pgdat); - __fs_reclaim_release(); + __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); From eb2169cee36fc492407a2d483c286b977a46288a Mon Sep 17 00:00:00 2001 From: liuhailong Date: Thu, 2 Sep 2021 14:53:01 -0700 Subject: [PATCH 323/474] mm: add kernel_misc_reclaimable in show_free_areas Print NR_KERNEL_MISC_RECLAIMABLE stat from show_free_areas() so users can check whether the shrinker is working correctly and to show the current memory usage. Link: https://lkml.kernel.org/r/20210813104725.4562-1-liuhailong@oppo.com Signed-off-by: liuhailong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51c17bf7b127..60a867a549bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), @@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); From 633a2abb9e1cd5c95f3b600f4b2c12cce22ae4a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:04 -0700 Subject: [PATCH 324/474] writeback: track number of inodes under writeback Patch series "writeback: Fix bandwidth estimates", v4. Fix estimate of writeback throughput when device is not fully busy doing writeback. Michael Stapelberg has reported that such workload (e.g. generated by linking) tends to push estimated throughput down to 0 and as a result writeback on the device is practically stalled. The first three patches fix the reported issue, the remaining two patches are unrelated cleanups of problems I've noticed when reading the code. This patch (of 4): Track number of inodes under writeback for each bdi_writeback structure. We will use this to decide whether wb does any IO and so we can estimate its writeback throughput. In principle we could use number of pages under writeback (WB_WRITEBACK counter) for this however normal percpu counter reads are too inaccurate for our purposes and summing the counter is too expensive. Link: https://lkml.kernel.org/r/20210713104519.16394-1-jack@suse.cz Link: https://lkml.kernel.org/r/20210713104716.22868-1-jack@suse.cz Signed-off-by: Jan Kara Cc: Wu Fengguang Cc: Michael Stapelberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 5 +++++ include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 1 + mm/page-writeback.c | 22 ++++++++++++++++++++-- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4c3370548982..7439ecd44ac9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode, inc_wb_stat(new_wb, WB_WRITEBACK); } + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + atomic_dec(&old_wb->writeback_inodes); + atomic_inc(&new_wb->writeback_inodes); + } + wb_get(new_wb); /* diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1d7edad9914f..06fb8e13f6bc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,7 @@ struct bdi_writeback { struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; unsigned long congested; /* WB_[a]sync_congested flags */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..b4c707ddedb1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -293,6 +293,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); + atomic_set(&wb->writeback_inodes, 0); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..e1aa1c9d8e36 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2731,6 +2731,16 @@ int clear_page_dirty_for_io(struct page *page) } EXPORT_SYMBOL(clear_page_dirty_for_io); +static void wb_inode_writeback_start(struct bdi_writeback *wb) +{ + atomic_inc(&wb->writeback_inodes); +} + +static void wb_inode_writeback_end(struct bdi_writeback *wb) +{ + atomic_dec(&wb->writeback_inodes); +} + int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2752,6 +2762,9 @@ int test_clear_page_writeback(struct page *page) dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); + if (!mapping_tagged(mapping, + PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } } @@ -2794,8 +2807,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) - inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + inc_wb_stat(wb, WB_WRITEBACK); + if (!on_wblist) + wb_inode_writeback_start(wb); + } /* * We can come through here when swapping anonymous From fee468fdf41cdf36ba6b5a780e2474d0a3e066ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:06 -0700 Subject: [PATCH 325/474] writeback: reliably update bandwidth estimation Currently we trigger writeback bandwidth estimation from balance_dirty_pages() and from wb_writeback(). However neither of these need to trigger when the system is relatively idle and writeback is triggered e.g. from fsync(2). Make sure writeback estimates happen reliably by triggering them from do_writepages(). Link: https://lkml.kernel.org/r/20210713104716.22868-2-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 3 --- include/linux/backing-dev.h | 19 ++++++++++++++++++ include/linux/writeback.h | 1 - mm/page-writeback.c | 39 +++++++++++++++++++++++++------------ 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7439ecd44ac9..867984e778c3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2004,7 +2004,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; @@ -2058,8 +2057,6 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); - wb_update_bandwidth(wb, wb_start); - /* * Did we write something? Try for more * diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..8a886bca51e5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) return inode->i_wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + /* + * If wbc does not have inode attached, it means cgroup writeback was + * disabled when wbc started. Just use the default wb in that case. + */ + return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; +} + /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode @@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) return &inode_to_bdi(inode)->wb; } +static inline struct bdi_writeback *inode_to_wb_wbc( + struct inode *inode, + struct writeback_control *wbc) +{ + return inode_to_wb(inode); +} + + static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..2480322c06a7 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,7 +379,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e1aa1c9d8e36..e4a381b8944b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1332,7 +1332,6 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, struct dirty_throttle_control *mdtc, - unsigned long start_time, bool update_ratelimit) { struct bdi_writeback *wb = gdtc->wb; @@ -1352,13 +1351,6 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); - /* - * Skip quiet periods when disk bandwidth is under-utilized. - * (at least 1s idle time between two flusher runs) - */ - if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) - goto snapshot; - if (update_ratelimit) { domain_update_bandwidth(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); @@ -1374,17 +1366,36 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, } wb_update_write_bandwidth(wb, elapsed, written); -snapshot: wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; } -void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) +static void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - __wb_update_bandwidth(&gdtc, NULL, start_time, false); + spin_lock(&wb->list_lock); + __wb_update_bandwidth(&gdtc, NULL, false); + spin_unlock(&wb->list_lock); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) + +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); + } } /* @@ -1713,7 +1724,7 @@ free_running: if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) { spin_lock(&wb->list_lock); - __wb_update_bandwidth(gdtc, mdtc, start_time, true); + __wb_update_bandwidth(gdtc, mdtc, true); spin_unlock(&wb->list_lock); } @@ -2347,9 +2358,12 @@ EXPORT_SYMBOL(generic_writepages); int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; + struct bdi_writeback *wb; if (wbc->nr_to_write <= 0) return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); while (1) { if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); @@ -2360,6 +2374,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } + wb_update_bandwidth(wb); return ret; } From 45a2966fd64147518dc5bca25f447bd0fb5359ac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:09 -0700 Subject: [PATCH 326/474] writeback: fix bandwidth estimate for spiky workload Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment. Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads. [jack@suse.cz: avoid division by 0 in wb_update_dirty_ratelimit()] Link: https://lore.kernel.org/lkml/20210617095309.3542373-1-stapelberg+linux@google.com Link: https://lkml.kernel.org/r/20210713104716.22868-3-jack@suse.cz Signed-off-by: Jan Kara Reported-by: Michael Stapelberg Tested-by: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 10 ++++++++ mm/page-writeback.c | 39 ++++++++++++++++++++------------ 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 06fb8e13f6bc..33207004cfde 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -143,6 +143,7 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + struct delayed_work bw_dwork; /* work item used for bandwidth estimate */ unsigned long dirty_sleep; /* last wait */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2480322c06a7..cbaef099645e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); +void wb_update_bandwidth(struct bdi_writeback *wb); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b4c707ddedb1..6122c78ce914 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + /* * Initial write bandwidth: 100 MB/s */ @@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); @@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); } static void wb_exit(struct bdi_writeback *wb) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e4a381b8944b..156f5888c09d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1336,18 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, { struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - wb->bw_time_stamp; + unsigned long elapsed; unsigned long dirtied; unsigned long written; - lockdep_assert_held(&wb->list_lock); + spin_lock(&wb->list_lock); /* - * rate-limit, only update once every 200ms. + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). Make sure elapsed >= 1 to avoid + * division errors. */ - if (elapsed < BANDWIDTH_INTERVAL) - return; - + elapsed = max(now - wb->bw_time_stamp, 1UL); dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); written = percpu_counter_read(&wb->stat[WB_WRITTEN]); @@ -1369,15 +1370,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; wb->bw_time_stamp = now; + spin_unlock(&wb->list_lock); } -static void wb_update_bandwidth(struct bdi_writeback *wb) +void wb_update_bandwidth(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; - spin_lock(&wb->list_lock); __wb_update_bandwidth(&gdtc, NULL, false); - spin_unlock(&wb->list_lock); } /* Interval after which we consider wb idle and don't estimate bandwidth */ @@ -1722,11 +1722,8 @@ free_running: wb->dirty_exceeded = 1; if (time_is_before_jiffies(wb->bw_time_stamp + - BANDWIDTH_INTERVAL)) { - spin_lock(&wb->list_lock); + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); - spin_unlock(&wb->list_lock); - } /* throttle according to the chosen dtc */ dirty_ratelimit = wb->dirty_ratelimit; @@ -2374,7 +2371,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) cond_resched(); congestion_wait(BLK_RW_ASYNC, HZ/50); } - wb_update_bandwidth(wb); + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); return ret; } @@ -2754,6 +2757,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb) static void wb_inode_writeback_end(struct bdi_writeback *wb) { atomic_dec(&wb->writeback_inodes); + /* + * Make sure estimate of writeback throughput gets updated after + * writeback completed. We delay the update by BANDWIDTH_INTERVAL + * (which is the interval other bandwidth updates use for batching) so + * that if multiple inodes end writeback at a similar time, they get + * batched into one bandwidth update. + */ + queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL); } int test_clear_page_writeback(struct page *page) From 42dd235cb15c06c8446f9aed695648a3c2eb8d11 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:12 -0700 Subject: [PATCH 327/474] writeback: rename domain_update_bandwidth() Rename domain_update_bandwidth() to domain_update_dirty_limit(). The original name is a misnomer. The function has nothing to do with a bandwidth, it updates dirty limits. Link: https://lkml.kernel.org/r/20210713104716.22868-4-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 156f5888c09d..7d8ad4260147 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1147,8 +1147,8 @@ update: dom->dirty_limit = limit; } -static void domain_update_bandwidth(struct dirty_throttle_control *dtc, - unsigned long now) +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) { struct wb_domain *dom = dtc_dom(dtc); @@ -1353,7 +1353,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, written = percpu_counter_read(&wb->stat[WB_WRITTEN]); if (update_ratelimit) { - domain_update_bandwidth(gdtc, now); + domain_update_dirty_limit(gdtc, now); wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); /* @@ -1361,7 +1361,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, * compiler has no way to figure that out. Help it. */ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { - domain_update_bandwidth(mdtc, now); + domain_update_dirty_limit(mdtc, now); wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); } } From 20792ebf3eeb828a692b29f3000673cb9ca83c3a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Sep 2021 14:53:15 -0700 Subject: [PATCH 328/474] writeback: use READ_ONCE for unlocked reads of writeback stats We do some unlocked reads of writeback statistics like avg_write_bandwidth, dirty_ratelimit, or bw_time_stamp. Generally we are fine with getting somewhat out-of-date values but actually getting different values in various parts of the functions because the compiler decided to reload value from original memory location could confuse calculations. Use READ_ONCE for these unlocked accesses and WRITE_ONCE for the updates to be on the safe side. Link: https://lkml.kernel.org/r/20210713104716.22868-5-jack@suse.cz Signed-off-by: Jan Kara Cc: Michael Stapelberg Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7d8ad4260147..c3b00c6f30ce 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { - unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); unsigned long long min = wb->bdi->min_ratio; unsigned long long max = wb->bdi->max_ratio; @@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, static void wb_position_ratio(struct dirty_throttle_control *dtc) { struct bdi_writeback *wb = dtc->wb; - unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long wb_thresh = dtc->wb_thresh; @@ -1115,7 +1115,7 @@ out: &wb->bdi->tot_write_bandwidth) <= 0); } wb->write_bandwidth = bw; - wb->avg_write_bandwidth = avg; + WRITE_ONCE(wb->avg_write_bandwidth, avg); } static void update_dirty_limit(struct dirty_throttle_control *dtc) @@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, else dirty_ratelimit -= step; - wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); @@ -1369,7 +1369,7 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, wb->dirtied_stamp = dirtied; wb->written_stamp = written; - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } @@ -1393,7 +1393,7 @@ static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) spin_lock(&wb->list_lock); wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); wb->written_stamp = wb_stat(wb, WB_WRITTEN); - wb->bw_time_stamp = now; + WRITE_ONCE(wb->bw_time_stamp, now); spin_unlock(&wb->list_lock); } } @@ -1418,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty, static unsigned long wb_max_pause(struct bdi_writeback *wb, unsigned long wb_dirty) { - unsigned long bw = wb->avg_write_bandwidth; + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); unsigned long t; /* @@ -1440,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb, unsigned long dirty_ratelimit, int *nr_dirtied_pause) { - long hi = ilog2(wb->avg_write_bandwidth); - long lo = ilog2(wb->dirty_ratelimit); + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1721,12 +1721,12 @@ free_running: if (dirty_exceeded && !wb->dirty_exceeded) wb->dirty_exceeded = 1; - if (time_is_before_jiffies(wb->bw_time_stamp + + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); /* throttle according to the chosen dtc */ - dirty_ratelimit = wb->dirty_ratelimit; + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; max_pause = wb_max_pause(wb, sdtc->wb_dirty); @@ -2376,7 +2376,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) * but if there's constant writeback being submitted, this makes sure * writeback bandwidth is updated once in a while. */ - if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) wb_update_bandwidth(wb); return ret; } From 3047250972ff935b1d7a0629fa3acb04c12dcc07 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 2 Sep 2021 14:53:18 -0700 Subject: [PATCH 329/474] mm: remove irqsave/restore locking from contexts with irqs enabled The page cache deletion paths all have interrupts enabled, so no need to use irqsafe/irqrestore locking variants. They used to have irqs disabled by the memcg lock added in commit c4843a7593a9 ("memcg: add per cgroup dirty page accounting"), but that has since been replaced by memcg taking the page lock instead, commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge AP"). Link: https://lkml.kernel.org/r/20210614211904.14420-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 15 ++++++--------- mm/truncate.c | 8 +++----- mm/vmscan.c | 9 ++++----- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d1458ecf2f51..4926f16ec52d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -258,12 +258,11 @@ static void page_cache_free_page(struct address_space *mapping, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long flags; BUG_ON(!PageLocked(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); page_cache_free_page(mapping, page); } @@ -335,19 +334,18 @@ void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec) { int i; - unsigned long flags; if (!pagevec_count(pvec)) return; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_delete_batch(mapping, pvec); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -821,7 +819,6 @@ void replace_page_cache_page(struct page *old, struct page *new) void (*freepage)(struct page *) = mapping->a_ops->freepage; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); @@ -833,7 +830,7 @@ void replace_page_cache_page(struct page *old, struct page *new) mem_cgroup_migrate(old, new); - xas_lock_irqsave(&xas, flags); + xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; @@ -846,7 +843,7 @@ void replace_page_cache_page(struct page *old, struct page *new) __dec_lruvec_page_state(old, NR_SHMEM); if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); - xas_unlock_irqrestore(&xas, flags); + xas_unlock_irq(&xas); if (freepage) freepage(old); put_page(old); diff --git a/mm/truncate.c b/mm/truncate.c index 234ddd879caa..2adff8f800bb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -560,21 +560,19 @@ void invalidate_mapping_pagevec(struct address_space *mapping, static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - unsigned long flags; - if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -582,7 +580,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 17c4b3fdd7dd..268ad6570751 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1052,14 +1052,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed, struct mem_cgroup *target_memcg) { - unsigned long flags; int refcount; void *shadow = NULL; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - xa_lock_irqsave(&mapping->i_pages, flags); + xa_lock_irq(&mapping->i_pages); /* * The non racy check for a busy page. * @@ -1100,7 +1099,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_swap_cache(page, swap, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -1126,7 +1125,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); if (freepage != NULL) freepage(page); @@ -1135,7 +1134,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - xa_unlock_irqrestore(&mapping->i_pages, flags); + xa_unlock_irq(&mapping->i_pages); return 0; } From 16e2df2a05d46c983bf310b19432c5ca4684b2bc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 2 Sep 2021 14:53:21 -0700 Subject: [PATCH 330/474] fs: drop_caches: fix skipping over shadow cache inodes When drop_caches truncates the page cache in an inode it also includes any shadow entries for evicted pages. However, there is a preliminary check on whether the inode has pages: if it has *only* shadow entries, it will skip running truncation on the inode and leave it behind. Fix the check to mapping_empty(), such that it runs truncation on any inode that has cache entries at all. Link: https://lkml.kernel.org/r/20210614211904.14420-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Roman Gushchin Acked-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index f00fcc4a4f72..e619c31b6bd9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -3,6 +3,7 @@ * Implement the manual drop-all-pagecache function */ +#include #include #include #include @@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * we need to reschedule to avoid softlockups. */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0 && !need_resched())) { + (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } From 7ae12c809f6a31d3da7b96339dbefa141884c711 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 2 Sep 2021 14:53:24 -0700 Subject: [PATCH 331/474] fs: inode: count invalidated shadow pages in pginodesteal pginodesteal is supposed to capture the impact that inode reclaim has on the page cache state. Currently, it doesn't consider shadow pages that get dropped this way, even though this can have a significant impact on paging behavior, memory pressure calculations etc. To improve visibility into these effects, make sure shadow pages get counted when they get dropped through inode reclaim. This changes the return value semantics of invalidate_mapping_pages() semantics slightly, but the only two users are the inode shrinker itsel and a usb driver that logs it for debugging purposes. Link: https://lkml.kernel.org/r/20210614211904.14420-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- mm/truncate.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..8830a727b0af 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -768,7 +768,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_ROTATE; } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { + if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); diff --git a/mm/truncate.c b/mm/truncate.c index 2adff8f800bb..787b35f2cdc1 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -483,8 +483,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, index = indices[i]; if (xa_is_value(page)) { - invalidate_exceptional_entry(mapping, index, - page); + count += invalidate_exceptional_entry(mapping, + index, + page); continue; } index += thp_nr_pages(page) - 1; @@ -512,19 +513,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, } /** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate + * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode + * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. + * This function removes pages that are clean, unmapped and unlocked, + * as well as shadow entries. It will not block on IO activity. * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. + * If you want to remove all the pages of one inode, regardless of + * their use and writeback state, use truncate_inode_pages(). * - * Return: the number of the pages that were invalidated + * Return: the number of the cache entries that were invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) From 7490a2d248145d8694e1e9828801b496250fd697 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:53:27 -0700 Subject: [PATCH 332/474] writeback: memcg: simplify cgroup_writeback_by_id Currently cgroup_writeback_by_id calls mem_cgroup_wb_stats() to get dirty pages for a memcg. However mem_cgroup_wb_stats() does a lot more than just get the number of dirty pages. Just directly get the number of dirty pages instead of calling mem_cgroup_wb_stats(). Also cgroup_writeback_by_id() is only called for best-effort dirty flushing, so remove the unused 'nr' parameter and no need to explicitly flush memcg stats. Link: https://lkml.kernel.org/r/20210722182627.2267368-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 20 +++++++++----------- include/linux/memcontrol.h | 15 +++++++++++++++ include/linux/writeback.h | 2 +- mm/memcontrol.c | 13 +------------ 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 867984e778c3..35894a2dba75 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1039,20 +1039,20 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -1081,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 24797929d8a1..3403ec77528a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -955,6 +955,16 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats.state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1391,6 +1401,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cbaef099645e..aeda2c0c9986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); bool cleanup_offline_cgwb(struct bdi_writeback *wb); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 702a81dfe72d..1047f0271ff8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -645,17 +645,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); } -/* idx can be of type enum memcg_stat_item or node_stat_item. */ -static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} - /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { @@ -4668,7 +4657,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb) atomic_read(&frn->done.cnt) == 1) { frn->at = 0; trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); - cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, WB_REASON_FOREIGN_FLUSH, &frn->done); } From 6de522d1667f628376517e4b177af10c739d745b Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Thu, 2 Sep 2021 14:53:30 -0700 Subject: [PATCH 333/474] include/linux/buffer_head.h: fix boolreturn.cocci warnings ./include/linux/buffer_head.h:412:64-65:WARNING:return of 0/1 in function 'has_bh_in_lru' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Link: https://lkml.kernel.org/r/20210824055828.58783-1-deng.changcheng@zte.com.cn Signed-off-by: Jing Yangyang Reported-by: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e7e99da31349..6486d3c19463 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus_cpu(int cpu) {} -static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } +static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ From 8fed2f3cd6da9fba1045c557f88fc3c46dc7d2d2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:53:33 -0700 Subject: [PATCH 334/474] mm: gup: remove set but unused local variable major Patch series "Cleanups and fixup for gup". This series contains cleanups to remove unneeded variable, useless BUG_ON and use helper to improve readability. Also we fix a potential pgmap refcnt leak. More details can be found in the respective changelogs. This patch (of 5): Since commit a2beb5f1efed ("mm: clean up the last pieces of page fault accountings"), the local variable major is unused. Remove it. Link: https://lkml.kernel.org/r/20210807093620.21347-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210807093620.21347-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b94717977d17..2532ec89f61b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1276,7 +1276,7 @@ int fixup_user_fault(struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_fault_t ret, major = 0; + vm_fault_t ret; address = untagged_addr(address); @@ -1296,7 +1296,6 @@ retry: return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); - major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); From 0fef147ba7329d54a08b3b8c7e152b6dd5391199 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:53:36 -0700 Subject: [PATCH 335/474] mm: gup: remove unneed local variable orig_refs Remove unneed local variable orig_refs since refs is unchanged now. Link: https://lkml.kernel.org/r/20210807093620.21347-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: Claudio Imbrenda Reviewed-by: David Hildenbrand Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2532ec89f61b..82074e819a8d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -117,8 +117,6 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, if (flags & FOLL_GET) return try_get_compound_head(page, refs); else if (flags & FOLL_PIN) { - int orig_refs = refs; - /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -150,7 +148,7 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, - orig_refs); + refs); return page; } From 06a9e696639c5e0f457ae117c39ec4cbe5096dc9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:53:39 -0700 Subject: [PATCH 336/474] mm: gup: remove useless BUG_ON in __get_user_pages() Indeed, this BUG_ON couldn't catch anything useful. We are sure ret == 0 here because we would already bail out if ret != 0 and ret is untouched till here. Link: https://lkml.kernel.org/r/20210807093620.21347-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 82074e819a8d..9f5b9a93ae21 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1149,7 +1149,6 @@ static long __get_user_pages(struct mm_struct *mm, * We must stop here. */ BUG_ON(gup_flags & FOLL_NOWAIT); - BUG_ON(ret != 0); goto out; } continue; From 6401c4eb57f947a49eb144b5b0787cde3318e82e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:53:42 -0700 Subject: [PATCH 337/474] mm: gup: fix potential pgmap refcnt leak in __gup_device_huge() When failed to try_grab_page, put_dev_pagemap() is missed. So pgmap refcnt will leak in this case. Also we remove the check for pgmap against NULL as it's also checked inside the put_dev_pagemap(). [akpm@linux-foundation.org: simplify, cleanup] [akpm@linux-foundation.org: fix return value] Link: https://lkml.kernel.org/r/20210807093620.21347-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages") Reviewed-by: John Hubbard Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9f5b9a93ae21..a10c48ef613c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2240,6 +2240,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; + int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2247,21 +2248,22 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; + ret = 0; + break; } (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); - if (pgmap) - put_dev_pagemap(pgmap); - return 1; + put_dev_pagemap(pgmap); + return ret; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, From be51eb18b81b2f0a0a562bac0f6fe0f4b05c439e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:53:45 -0700 Subject: [PATCH 338/474] mm: gup: use helper PAGE_ALIGNED in populate_vma_page_range() Use helper PAGE_ALIGNED to check if address is aligned to PAGE_SIZE. Minor readability improvement. Link: https://lkml.kernel.org/r/20210807093620.21347-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jan Kara Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a10c48ef613c..1973bfe59b2e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1471,8 +1471,8 @@ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); From 3967db22ba324939762f618f0d654b13317ca7a4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:48 -0700 Subject: [PATCH 339/474] mm/gup: documentation corrections for gup/pup Patch series "A few gup refactorings and documentation updates", v3. While reviewing some of the other things going on around gup.c, I noticed that the documentation was wrong for a few of the routines that I wrote. And then I noticed that there was some significant code duplication too. So this fixes those issues. This is not entirely risk-free, but after looking closely at this, I think it's actually a useful improvement, getting rid of the code duplication here. This patch (of 3): The documentation for try_grab_compound_head() and try_grab_page() has fallen a little out of date. Update and clarify a few points. Also make it kerneldoc-correct, by adding @args documentation. Link: https://lkml.kernel.org/r/20210813044133.1536842-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20210813044133.1536842-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1973bfe59b2e..26ce6bb52044 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -92,10 +92,17 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) return head; } -/* +/** * try_grab_compound_head() - attempt to elevate a page's refcount, by a * flags-dependent amount. * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the page's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. * @@ -103,8 +110,14 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * FOLL_GET: page's refcount will be incremented by @refs. + * + * FOLL_PIN on compound pages that are > two pages long: page's refcount will + * be incremented by @refs, and page[2].hpage_pinned_refcount will be + * incremented by @refs * GUP_PIN_COUNTING_BIAS. + * + * FOLL_PIN on normal pages, or compound pages that are two pages long: + * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS. * * Return: head page (with refcount appropriately incremented) for success, or * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's @@ -141,6 +154,8 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, * * However, be sure to *also* increment the normal page refcount * field at least once, so that the page really is pinned. + * That's why the refcount from the earlier + * try_get_compound_head() is left intact. */ if (hpage_pincount_available(page)) hpage_pincount_add(page, refs); @@ -184,10 +199,8 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) * @flags: gup flags: these are the FOLL_* flag values. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same - * time. Cases: - * - * FOLL_GET: page's refcount will be incremented by 1. - * FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS. + * time. Cases: please see the try_grab_compound_head() documentation, with + * "refs=1". * * Return: true for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or From 54d516b1d62ff8f17cee2da06e5e4706a0d00b8a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:51 -0700 Subject: [PATCH 340/474] mm/gup: small refactoring: simplify try_grab_page() try_grab_page() does the same thing as try_grab_compound_head(..., refs=1, ...), just with a different API. So there is a lot of code duplication there. Change try_grab_page() to call try_grab_compound_head(), while keeping the API contract identical for callers. Also, now that try_grab_compound_head() always has a caller, remove the __maybe_unused annotation. Link: https://lkml.kernel.org/r/20210813044133.1536842-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/gup.c | 35 +++++------------------------------ 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ca22e6e694a..af4845e81b84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1214,8 +1214,8 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); -__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, - unsigned int flags); +struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); static inline __must_check bool try_get_page(struct page *page) diff --git a/mm/gup.c b/mm/gup.c index 26ce6bb52044..d60419ed9262 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -__maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, unsigned int flags) +struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); @@ -208,35 +208,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) */ bool __must_check try_grab_page(struct page *page, unsigned int flags) { - WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + if (!(flags & (FOLL_GET | FOLL_PIN))) + return true; - if (flags & FOLL_GET) - return try_get_page(page); - else if (flags & FOLL_PIN) { - int refs = 1; - - page = compound_head(page); - - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_add(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - /* - * Similar to try_grab_compound_head(): even if using the - * hpage_pincount_add/_sub() routines, be sure to - * *also* increment the normal page refcount field at least - * once, so that the page really is pinned. - */ - page_ref_add(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1); - } - - return true; + return try_grab_compound_head(page, 1, flags); } /** From 9857a17f206ff374aea78bccfb687f145368be2e Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:53:54 -0700 Subject: [PATCH 341/474] mm/gup: remove try_get_page(), call try_get_compound_head() directly try_get_page() is very similar to try_get_compound_head(), and in fact try_get_page() has fallen a little behind in terms of maintenance: try_get_compound_head() handles speculative page references more thoroughly. There are only two try_get_page() callsites, so just call try_get_compound_head() directly from those, and remove try_get_page() entirely. Also, seeing as how this changes try_get_compound_head() into a non-static function, provide some kerneldoc documentation for it. Link: https://lkml.kernel.org/r/20210813044133.1536842-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/fault.c | 2 +- fs/pipe.c | 2 +- include/linux/mm.h | 10 +--------- mm/gup.c | 21 +++++++++++++++++---- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e33c43b38afe..81d760749987 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -817,7 +817,7 @@ void do_secure_storage_access(struct pt_regs *regs) break; case KERNEL_FAULT: page = phys_to_page(addr); - if (unlikely(!try_get_page(page))) + if (unlikely(!try_get_compound_head(page, 1))) break; rc = arch_make_page_accessible(page); put_page(page); diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..1fa1f52763f0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - return try_get_page(buf->page); + return try_get_compound_head(buf->page, 1); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/include/linux/mm.h b/include/linux/mm.h index af4845e81b84..d3439dd4f4ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1217,15 +1217,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags); struct page *try_grab_compound_head(struct page *page, int refs, unsigned int flags); - -static inline __must_check bool try_get_page(struct page *page) -{ - page = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(page) <= 0)) - return false; - page_ref_inc(page); - return true; -} +struct page *try_get_compound_head(struct page *page, int refs); static inline void put_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index d60419ed9262..1c7f4ec6990b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -62,11 +62,24 @@ static void put_page_refs(struct page *page, int refs) put_page(page); } -/* - * Return the compound head page with ref appropriately incremented, - * or NULL if that failed. +/** + * try_get_compound_head() - return the compound head page with refcount + * appropriately incremented, or NULL if that failed. + * + * This handles potential refcount overflow correctly. It also works correctly + * for various lockless get_user_pages()-related callers, due to the use of + * page_cache_add_speculative(). + * + * Even though the name includes "compound_head", this function is still + * appropriate for callers that have a non-compound @page to get. + * + * @page: pointer to page to be gotten + * @refs: the value to add to the page's refcount + * + * Return: head page (with refcount appropriately incremented) for success, or + * NULL upon failure. */ -static inline struct page *try_get_compound_head(struct page *page, int refs) +struct page *try_get_compound_head(struct page *page, int refs) { struct page *head = compound_head(page); From 51cc3a6620a6ca934d468bda345678768493f5d8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:53:57 -0700 Subject: [PATCH 342/474] fs, mm: fix race in unlinking swapfile We had a recurring situation in which admin procedures setting up swapfiles would race with test preparation clearing away swapfiles; and just occasionally that got stuck on a swapfile "(deleted)" which could never be swapped off. That is not supposed to be possible. 2.6.28 commit f9454548e17c ("don't unlink an active swapfile") admitted that it was leaving a race window open: now close it. may_delete() makes the IS_SWAPFILE check (amongst many others) before inode_lock has been taken on target: now repeat just that simple check in vfs_unlink() and vfs_rename(), after taking inode_lock. Which goes most of the way to fixing the race, but swapon() must also check after it acquires inode_lock, that the file just opened has not already been unlinked. Link: https://lkml.kernel.org/r/e17b91ad-a578-9a15-5e3-4989e0f999b5@google.com Fixes: f9454548e17c ("don't unlink an active swapfile") Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 8 +++++++- mm/swapfile.c | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index bf6d8a738c59..ff866c07f4d2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4024,7 +4024,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, return -EPERM; inode_lock(target); - if (is_local_mountpoint(dentry)) + if (IS_SWAPFILE(target)) + error = -EPERM; + else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -4526,6 +4528,10 @@ int vfs_rename(struct renamedata *rd) else if (target) inode_lock(target); + error = -EPERM; + if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) + goto out; + error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e07d1c776f2..7527afd95284 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; + struct dentry *dentry; int prio; int error; union swap_header *swap_header; @@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; + dentry = swap_file->f_path.dentry; inode = mapping->host; error = claim_swapfile(p, inode); @@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; inode_lock(inode); + if (d_unlinked(dentry) || cant_mount(dentry)) { + error = -ENOENT; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode; From 3969b1a654fb09b7915efc1aa4ad45daf932e12f Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 2 Sep 2021 14:54:00 -0700 Subject: [PATCH 343/474] mm: delete unused get_kernel_page() get_kernel_page() was added in 2012 by [1]. It was used for a while for NFS, but then in 2014, a refactoring [2] removed all callers, and it has apparently not been used since. Remove get_kernel_page() because it has no callers. [1] commit 18022c5d8627 ("mm: add get_kernel_page[s] for pinning of kernel addresses for I/O") [2] commit 91f79c43d1b5 ("new helper: iov_iter_get_pages_alloc()") Link: https://lkml.kernel.org/r/20210729221847.1165665-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Rik van Riel Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/swap.c | 22 ---------------------- 2 files changed, 23 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d3439dd4f4ba..35bbac32b6f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1839,7 +1839,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); diff --git a/mm/swap.c b/mm/swap.c index 19600430e536..897200d27dd0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -/* - * get_kernel_page() - pin a kernel page in memory - * @start: starting kernel address - * @write: pinning for read/write, currently ignored - * @pages: array that receives pointer to the page pinned. - * Must be at least nr_segs long. - * - * Returns 1 if page is pinned. If the page was not pinned, returns - * -errno. The page returned must be released with a put_page() call - * when it is finished with. - */ -int get_kernel_page(unsigned long start, int write, struct page **pages) -{ - const struct kvec kiov = { - .iov_base = (void *)start, - .iov_len = PAGE_SIZE - }; - - return get_kernel_pages(&kiov, 1, write, pages); -} -EXPORT_SYMBOL_GPL(get_kernel_page); - static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec)) { From bf11b9a8e9a93c1fc0ebfc2929622d5cf7d43888 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 2 Sep 2021 14:54:03 -0700 Subject: [PATCH 344/474] shmem: use raw_spinlock_t for ->stat_lock Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is per-CPU. Access here is serialized by disabling preemption. If the pool is empty, it gets reloaded from `->next_ino'. Access here is serialized by ->stat_lock which is a spinlock_t and can not be acquired with disabled preemption. One way around it would make per-CPU ino_batch struct containing the inode number a local_lock_t. Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical sections are short. The mpol_put() must be moved outside of the critical section to avoid invoking the destructor with disabled preemption. Link: https://lkml.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 31 +++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 8e775ce517bb..0a8499fb9c3c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ diff --git a/mm/shmem.c b/mm/shmem.c index dacda7463d54..7eec13e39ac3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; @@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) } *inop = ino; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it @@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * to worry about things like glibc compatibility. */ ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } @@ -341,9 +342,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -1453,10 +1454,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -3488,9 +3489,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3535,14 +3537,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3659,7 +3662,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); From f2b346e4522c2849a3dc59b7077b8952918ef486 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:54:06 -0700 Subject: [PATCH 345/474] shmem: remove unneeded variable ret Patch series "Cleanups for shmem". This series contains cleanups to remove unneeded variable, header file, function forward declaration and so on. More details can be found in the respective changelogs. This patch (of 4): The local variable ret is always equal to -ENOMEM and never touched. So remove it and return -ENOMEM directly to simplify the code. Link: https://lkml.kernel.org/r/20210812120350.49801-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210812120350.49801-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7eec13e39ac3..e28d2593ec27 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3616,7 +3616,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; - int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), @@ -3694,7 +3693,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return err; + return -ENOMEM; } static int shmem_get_tree(struct fs_context *fc) From b6378fc8b477921e62fd5763efeec676ff8e7a16 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:54:09 -0700 Subject: [PATCH 346/474] shmem: remove unneeded header file mfill_atomic_install_pte() is introduced to install pte and update mmu cache since commit bf6ebd97aba0 ("userfaultfd/shmem: modify shmem_mfill_atomic_pte to use install_pte()"). So we should remove tlbflush.h as update_mmu_cache() is not called here now. Link: https://lkml.kernel.org/r/20210812120350.49801-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e28d2593ec27..c69adf9a2db3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -39,8 +39,6 @@ #include #include -#include /* for arch/microblaze update_mmu_cache() */ - static struct vfsmount *shm_mnt; #ifdef CONFIG_SHMEM From cdd89d4cb6507f123f6ecbfe51050e9a9844c2ab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:54:12 -0700 Subject: [PATCH 347/474] shmem: remove unneeded function forward declaration The forward declaration for shmem_should_replace_page() and shmem_replace_page() is unnecessary. Remove them. Link: https://lkml.kernel.org/r/20210812120350.49801-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c69adf9a2db3..c4087f70c86c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -135,9 +135,6 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static bool shmem_should_replace_page(struct page *page, gfp_t gfp); -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index); static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, From 86a2f3f2d99e9765d13a55ee2e4364deb6cdf794 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:54:15 -0700 Subject: [PATCH 348/474] shmem: include header file to declare swap_info It's bad to extern swap_info[] in .c. Include corresponding header file instead. Link: https://lkml.kernel.org/r/20210812120350.49801-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c4087f70c86c..f841c7adb8b2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include static struct vfsmount *shm_mnt; @@ -1152,8 +1153,6 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -extern struct swap_info_struct *swap_info[]; - static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, From 050dcb5c85bb47f8151175ca5833aa882cc7fe0c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:18 -0700 Subject: [PATCH 349/474] huge tmpfs: fix fallocate(vanilla) advance over huge pages Patch series "huge tmpfs: shmem_is_huge() fixes and cleanups". A series of huge tmpfs fixes and cleanups. This patch (of 9): shmem_fallocate() goes to a lot of trouble to leave its newly allocated pages !Uptodate, partly to identify and undo them on failure, partly to leave the overhead of clearing them until later. But the huge page case did not skip to the end of the extent, walked through the tail pages one by one, and appeared to work just fine: but in doing so, cleared and Uptodated the huge page, so there was no way to undo it on failure. And by setting Uptodate too soon, it messed up both its nr_falloced and nr_unswapped counts, so that the intended "time to give up" heuristic did not work at all. Now advance immediately to the end of the huge extent, with a comment on why this is more than just an optimization. But although this speeds up huge tmpfs fallocation, it does leave the clearing until first use, and some users may have come to appreciate slow fallocate but fast first use: if they complain, then we can consider adding a pass to clear at the end. Link: https://lkml.kernel.org/r/da632211-8e3e-6b1-aee-ab24734429a0@google.com Link: https://lkml.kernel.org/r/16201bd2-70e-37e2-e89b-5f929430da@google.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: Shakeel Butt Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Mike Kravetz Cc: Michal Hocko Cc: Rik van Riel Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f841c7adb8b2..9ef579f6cab3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2719,7 +2719,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); - for (index = start; index < end; index++) { + for (index = start; index < end; ) { struct page *page; /* @@ -2742,13 +2742,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } + index++; + /* + * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same huge page will clear it, + * making it PageUptodate and un-undoable if we fail later. + */ + if (PageTransCompound(page)) { + index = round_up(index, HPAGE_PMD_NR); + /* Beware 32-bit wraparound */ + if (!index) + index--; + } + /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - shmem_falloc.next++; if (!PageUptodate(page)) - shmem_falloc.nr_falloced++; + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; /* * If !PageUptodate, leave it that way so that freeable pages From d144bf6205342a4b5fed5d204ae18849a4de741b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:21 -0700 Subject: [PATCH 350/474] huge tmpfs: fix split_huge_page() after FALLOC_FL_KEEP_SIZE A successful shmem_fallocate() guarantees that the extent has been reserved, even beyond i_size when the FALLOC_FL_KEEP_SIZE flag was used. But that guarantee is broken by shmem_unused_huge_shrink()'s attempts to split huge pages and free their excess beyond i_size; and by other uses of split_huge_page() near i_size. It's sad to add a shmem inode field just for this, but I did not find a better way to keep the guarantee. A flag to say KEEP_SIZE has been used would be cheaper, but I'm averse to unclearable flags. The fallocend field is not perfect either (many disjoint ranges might be fallocated), but good enough; and gains another use later on. Link: https://lkml.kernel.org/r/ca9a146-3a59-6cd3-7f28-e9a044bb1052@google.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 13 +++++++++++++ mm/huge_memory.c | 6 ++++-- mm/shmem.c | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0a8499fb9c3c..bfc5899d18e0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -18,6 +18,7 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ + pgoff_t fallocend; /* highest fallocate endindex */ struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ struct shared_policy policy; /* NUMA memory alloc policy */ @@ -119,6 +120,18 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* + * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages + * beyond i_size's notion of EOF, which fallocate has committed to reserving: + * which split_huge_page() must therefore not delete. This use of a single + * "fallocend" per inode errs on the side of not deleting a reservation when + * in doubt: there are plenty of cases when it preserves unreserved pages. + */ +static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) +{ + return max(eof, SHMEM_I(inode)->fallocend); +} + extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index afff3ac87067..890fb73ac89b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2454,11 +2454,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); - /* Some pages can be beyond i_size: drop them from page cache */ + /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { ClearPageDirty(head + i); __delete_from_page_cache(head + i, NULL); - if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) + if (shmem_mapping(head->mapping)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); } else if (!PageAnon(page)) { @@ -2686,6 +2686,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * head page lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (shmem_mapping(mapping)) + end = shmem_fallocend(mapping->host, end); } /* diff --git a/mm/shmem.c b/mm/shmem.c index 9ef579f6cab3..e391325dfc21 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -902,6 +902,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + pagevec_init(&pvec); index = start; while (index < end && find_lock_entries(mapping, index, end - 1, @@ -2650,7 +2653,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2719,6 +2722,15 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. + */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + for (index = start; index < end; ) { struct page *page; @@ -2733,6 +2745,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { + info->fallocend = undo_fallocend; /* Remove the !PageUptodate pages we added */ if (index > start) { shmem_undo_range(inode, From 2b5bbcb1c9c2cd05a06dcf54df77255a8c406a7b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:24 -0700 Subject: [PATCH 351/474] huge tmpfs: remove shrinklist addition from shmem_setattr() There's a block of code in shmem_setattr() to add the inode to shmem_unused_huge_shrink()'s shrinklist when lowering i_size: it dates from before 5.7 changed truncation to do split_huge_page() for itself, and should have been removed at that time. I am over-stating that: split_huge_page() can fail (notably if there's an extra reference to the page at that time), so there might be value in retrying. But there were already retries as truncation worked through the tails, and this addition risks repeating unsuccessful retries indefinitely: I'd rather remove it now, and work on reducing the chance of split_huge_page() failures separately, if we need to. Link: https://lkml.kernel.org/r/b73b3492-8822-18f9-83e2-938528cdde94@google.com Fixes: 71725ed10c40 ("mm: huge tmpfs: try to split_huge_page() when punching hole") Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e391325dfc21..9403048f4ee1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1058,7 +1058,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; error = setattr_prepare(&init_user_ns, dentry, attr); @@ -1094,24 +1093,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - - /* - * Part of the huge page can be beyond i_size: subject - * to shrink under memory pressure. - */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } } } From b9e2faaf6fa0df984d4ecca775f3629a4d5e599b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:27 -0700 Subject: [PATCH 352/474] huge tmpfs: revert shmem's use of transhuge_vma_enabled() 5.14 commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled()") added transhuge_vma_enabled() as a wrapper for two very different checks (one check is whether the app has marked its address range not to use THPs, the other check is whether the app is running in a hierarchy that has been marked never to use THPs). shmem_huge_enabled() prefers to show those two checks explicitly, as before. Link: https://lkml.kernel.org/r/45e5338-18d-c6f9-c17e-34f510bc1728@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9403048f4ee1..43cb1a99f3ce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3987,7 +3987,8 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) loff_t i_size; pgoff_t off; - if (!transhuge_vma_enabled(vma, vma->vm_flags)) + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; From c852023e6fd4fa5f75175729e0b55abb062ca799 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:31 -0700 Subject: [PATCH 353/474] huge tmpfs: move shmem_huge_enabled() upwards shmem_huge_enabled() is about to be enhanced into shmem_is_huge(), so that it can be used more widely throughout: before making functional changes, shift it to its final position (to avoid forward declaration). Link: https://lkml.kernel.org/r/16fec7b7-5c84-415a-8586-69d8bf6a6685@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 72 ++++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 43cb1a99f3ce..2df6a5370cd7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -473,6 +473,41 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly; +bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + loff_t i_size; + pgoff_t off; + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + switch (sbinfo->huge) { + case SHMEM_HUGE_NEVER: + return false; + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >= HPAGE_PMD_SIZE && + i_size >> PAGE_SHIFT >= off) + return true; + fallthrough; + case SHMEM_HUGE_ADVISE: + /* TODO: implement fadvise() hints */ + return (vma->vm_flags & VM_HUGEPAGE); + default: + VM_BUG_ON(1); + return false; + } +} + #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { @@ -3979,43 +4014,6 @@ struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - loff_t i_size; - pgoff_t off; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; - case SHMEM_HUGE_ALWAYS: - return true; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - return true; - fallthrough; - case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); - default: - VM_BUG_ON(1); - return false; - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - #else /* !CONFIG_SHMEM */ /* From acdd9f8e0fed9f1bd7e83a8ff934694bb4c9a72b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:34 -0700 Subject: [PATCH 354/474] huge tmpfs: SGP_NOALLOC to stop collapse_file() on race khugepaged's collapse_file() currently uses SGP_NOHUGE to tell shmem_getpage() not to try allocating a huge page, in the very unlikely event that a racing hole-punch removes the swapped or fallocated page as soon as i_pages lock is dropped. We want to consolidate shmem's huge decisions, removing SGP_HUGE and SGP_NOHUGE; but cannot quite persuade ourselves that it's okay to regress the protection in this case - Yang Shi points out that the huge page would remain indefinitely, charged to root instead of the intended memcg. collapse_file() should not even allocate a small page in this case: why proceed if someone is punching a hole? SGP_READ is almost the right flag here, except that it optimizes away from a fallocated page, with NULL to tell caller to fill with zeroes (like a hole); whereas collapse_file()'s sequence relies on using a cache page. Add SGP_NOALLOC just for this. There are too many consecutive "if (page"s there in shmem_getpage_gfp(): group it better; and fix the outdated "bring it back from swap" comment. Link: https://lkml.kernel.org/r/1355343b-acf-4653-ef79-6aee40214ac5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 1 + mm/khugepaged.c | 2 +- mm/shmem.c | 29 +++++++++++++++++------------ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index bfc5899d18e0..a3f4502ec8a9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,6 +94,7 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, /* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ + SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0412be08fa2..045cc579f724 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } diff --git a/mm/shmem.c b/mm/shmem.c index 2df6a5370cd7..867cb404090a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1854,26 +1854,31 @@ repeat: return error; } - if (page) + if (page) { hindex = page->index; - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (sgp == SGP_WRITE) + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + /* fallocated page */ if (sgp != SGP_READ) goto clear; unlock_page(page); put_page(page); - page = NULL; - hindex = index; } - if (page || sgp == SGP_READ) - goto out; /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL page, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + */ + *pagep = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. */ if (vma && userfaultfd_missing(vma)) { From 5e6e5a12a44ca5ff2b130d8d39aaf9b8c026de94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:37 -0700 Subject: [PATCH 355/474] huge tmpfs: shmem_is_huge(vma, inode, index) Extend shmem_huge_enabled(vma) to shmem_is_huge(vma, inode, index), so that a consistent set of checks can be applied, even when the inode is accessed through read/write syscalls (with NULL vma) instead of mmaps (the index argument is seldom of interest, but required by mount option "huge=within_size"). Clean up and rearrange the checks a little. This then replaces the checks which shmem_fault() and shmem_getpage_gfp() were making, and eliminates the SGP_HUGE and SGP_NOHUGE modes. Replace a couple of 0s by explicit SHMEM_HUGE_NEVERs; and replace the obscure !shmem_mapping() symlink check by explicit S_ISLNK() - nothing else needs that symlink check, so leave it there in shmem_getpage_gfp(). Link: https://lkml.kernel.org/r/23a77889-2ddc-b030-75cd-44ca27fd4d1@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 9 +++-- mm/shmem.c | 84 ++++++++++++---------------------------- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a3f4502ec8a9..166158b6e917 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -86,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); extern int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse); -extern bool shmem_huge_enabled(struct vm_area_struct *vma); +extern bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +{ + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); +} extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -96,8 +101,6 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ - SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; diff --git a/mm/shmem.c b/mm/shmem.c index 867cb404090a..69c9788a0094 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -471,39 +471,35 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_huge_enabled(struct vm_area_struct *vma) +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) { - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); loff_t i_size; - pgoff_t off; - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; + + switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return true; case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); + index = round_up(index, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) + if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index) return true; fallthrough; case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; default: - VM_BUG_ON(1); return false; } } @@ -677,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY +bool shmem_is_huge(struct vm_area_struct *vma, + struct inode *inode, pgoff_t index) +{ + return false; +} + static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) { @@ -1812,7 +1814,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct page *page; - enum sgp_type sgp_huge = sgp; pgoff_t hindex = index; gfp_t huge_gfp; int error; @@ -1821,8 +1822,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1886,36 +1885,12 @@ repeat: return 0; } - /* shmem_symlink() */ - if (!shmem_mapping(mapping)) + /* Never use a huge page for shmem_symlink() */ + if (S_ISLNK(inode->i_mode)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + if (!shmem_is_huge(vma, inode, index)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: { - loff_t i_size; - pgoff_t off; - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - - fallthrough; - } - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } - -alloc_huge: huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true); @@ -2071,7 +2046,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2134,15 +2108,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); @@ -3950,7 +3916,7 @@ int __init shmem_init(void) if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif return 0; From a7fddc36299a8a99073e9e6e922b6cd451508385 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:40 -0700 Subject: [PATCH 356/474] huge tmpfs: decide stat.st_blksize by shmem_is_huge() 4.18 commit 89fdcd262fd4 ("mm: shmem: make stat.st_blksize return huge page size if THP is on") added is_huge_enabled() to decide st_blksize: if hugeness is to be defined per file, that will need to be replaced by shmem_is_huge(). This does give a different answer (No) for small files on a "huge=within_size" mount: but that can be considered a minor bugfix. And a different answer (No) for default files on a "huge=advise" mount: I'm reluctant to complicate it, just to reproduce the same debatable answer as before. Link: https://lkml.kernel.org/r/af7fb3f9-4415-9e8e-fdac-b1a5253ad21@google.com Signed-off-by: Hugh Dickins Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 69c9788a0094..69aa932c08da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -686,15 +686,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) -{ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && - shmem_huge != SHMEM_HUGE_DENY) - return true; - return false; -} - /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -1075,7 +1066,6 @@ static int shmem_getattr(struct user_namespace *mnt_userns, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -1084,7 +1074,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, } generic_fillattr(&init_user_ns, inode, stat); - if (is_huge_enabled(sb_info)) + if (shmem_is_huge(NULL, inode, 0)) stat->blksize = HPAGE_PMD_SIZE; return 0; From 1e6decf30af5c5c75445ed6ad4e65a26de578a03 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 2 Sep 2021 14:54:43 -0700 Subject: [PATCH 357/474] shmem: shmem_writepage() split unlikely i915 THP drivers/gpu/drm/i915/gem/i915_gem_shmem.c contains a shmem_writeback() which calls shmem_writepage() from a shrinker: that usually works well enough; but if /sys/kernel/mm/transparent_hugepage/shmem_enabled has been set to "always" (intended to be usable) or "force" (forces huge everywhere for easy testing), shmem_writepage() is surprised to be called with a huge page, and crashes on the VM_BUG_ON_PAGE(PageCompound) (I did not find out where the crash happens when CONFIG_DEBUG_VM is off). LRU page reclaim always splits the shmem huge page first: I'd prefer not to demand that of i915, so check and split compound in shmem_writepage(). Patch history: when first sent last year http://lkml.kernel.org/r/alpine.LSU.2.11.2008301401390.5954@eggly.anvils https://lore.kernel.org/linux-mm/20200919042009.bomzxmrg7%25akpm@linux-foundation.org/ Matthew Wilcox noticed that tail pages were wrongly left clean. This version brackets the split with Set and Clear PageDirty as he suggested: which works very well, even if it falls short of our aspirations. And recently I realized that the crash is not limited to the testing option "force", but affects "always" too: which is more important to fix. Link: https://lkml.kernel.org/r/bac6158c-8b3d-4dca-cffc-4982f58d9794@google.com Fixes: 2d6692e642e7 ("drm/i915: Start writeback from the shrinker") Signed-off-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 69aa932c08da..21c29f7e3894 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1344,7 +1344,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) swp_entry_t swap; pgoff_t index; - VM_BUG_ON_PAGE(PageCompound(page), page); + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. + */ + if (PageTransCompound(page)) { + /* Ensure the subpages are still dirty */ + SetPageDirty(page); + if (split_huge_page(page) < 0) + goto redirty; + ClearPageDirty(page); + } + BUG_ON(!PageLocked(page)); mapping = page->mapping; index = page->index; From 56cab2859fbe08e1f2a5ac3f4655dcc398bc013d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:47 -0700 Subject: [PATCH 358/474] mm, memcg: add mem_cgroup_disabled checks in vmpressure and swap-related functions Add mem_cgroup_disabled check in vmpressure, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~2.1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n, CONFIG_MEMCG_SWAP=n} against {CONFIG_MEMCG=y, CONFIG_MEMCG_SWAP=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ mm/swapfile.c | 3 +++ mm/vmpressure.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1047f0271ff8..211f3911228f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7297,6 +7297,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swapfile.c b/mm/swapfile.c index 7527afd95284..627b16aed1dc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3784,6 +3784,9 @@ void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) struct swap_info_struct *si, *next; int nid = page_to_nid(page); + if (mem_cgroup_disabled()) + return; + if (!(gfp_mask & __GFP_IO)) return; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index d69019fc3789..9b172561fded 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -240,7 +240,12 @@ static void vmpressure_work_fn(struct work_struct *work) void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { - struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure *vmpr; + + if (mem_cgroup_disabled()) + return; + + vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to From 2c8d8f97ae2272f1455ee31a5af62b326772eb31 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:50 -0700 Subject: [PATCH 359/474] mm, memcg: inline mem_cgroup_{charge/uncharge} to improve disabled memcg config Inline mem_cgroup_{charge/uncharge} and mem_cgroup_uncharge_list functions functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~0.4% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jens Axboe Cc: Joonsoo Kim Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Minchan Kim Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Yang Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 +++++++++++++++++++++++++--- mm/memcontrol.c | 33 ++++++++++++--------------------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3403ec77528a..a00bf337567b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -693,13 +693,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) page_counter_read(&memcg->memory); } -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask); +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_charge(page, mm, gfp_mask); +} + int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); -void mem_cgroup_uncharge(struct page *page); -void mem_cgroup_uncharge_list(struct list_head *page_list); +void __mem_cgroup_uncharge(struct page *page); +static inline void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge(page); +} + +void __mem_cgroup_uncharge_list(struct list_head *page_list); +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_list(page_list); +} void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 211f3911228f..33bb8434eea0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6693,8 +6693,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_low_usage))); } -static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg, - gfp_t gfp) +static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp) { unsigned int nr_pages = thp_nr_pages(page); int ret; @@ -6715,7 +6714,7 @@ out: } /** - * mem_cgroup_charge - charge a newly allocated page to a cgroup + * __mem_cgroup_charge - charge a newly allocated page to a cgroup * @page: page to charge * @mm: mm context of the victim * @gfp_mask: reclaim mode @@ -6728,16 +6727,14 @@ out: * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) +int __mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { struct mem_cgroup *memcg; int ret; - if (mem_cgroup_disabled()) - return 0; - memcg = get_mem_cgroup_from_mm(mm); - ret = __mem_cgroup_charge(page, memcg, gfp_mask); + ret = charge_memcg(page, memcg, gfp_mask); css_put(&memcg->css); return ret; @@ -6772,7 +6769,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, memcg = get_mem_cgroup_from_mm(mm); rcu_read_unlock(); - ret = __mem_cgroup_charge(page, memcg, gfp); + ret = charge_memcg(page, memcg, gfp); css_put(&memcg->css); return ret; @@ -6908,18 +6905,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } /** - * mem_cgroup_uncharge - uncharge a page + * __mem_cgroup_uncharge - uncharge a page * @page: page to uncharge * - * Uncharge a page previously charged with mem_cgroup_charge(). + * Uncharge a page previously charged with __mem_cgroup_charge(). */ -void mem_cgroup_uncharge(struct page *page) +void __mem_cgroup_uncharge(struct page *page) { struct uncharge_gather ug; - if (mem_cgroup_disabled()) - return; - /* Don't touch page->lru of any random page, pre-check: */ if (!page_memcg(page)) return; @@ -6930,20 +6924,17 @@ void mem_cgroup_uncharge(struct page *page) } /** - * mem_cgroup_uncharge_list - uncharge a list of page + * __mem_cgroup_uncharge_list - uncharge a list of page * @page_list: list of pages to uncharge * * Uncharge a list of pages previously charged with - * mem_cgroup_charge(). + * __mem_cgroup_charge(). */ -void mem_cgroup_uncharge_list(struct list_head *page_list) +void __mem_cgroup_uncharge_list(struct list_head *page_list) { struct uncharge_gather ug; struct page *page; - if (mem_cgroup_disabled()) - return; - uncharge_gather_clear(&ug); list_for_each_entry(page, page_list, lru) uncharge_page(page, &ug); From 01c4b28cd2e600160566a7d83b4703800381dae1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 14:54:54 -0700 Subject: [PATCH 360/474] mm, memcg: inline swap-related functions to improve disabled memcg config Inline mem_cgroup_try_charge_swap, mem_cgroup_uncharge_swap and cgroup_throttle_swaprate functions to perform mem_cgroup_disabled static key check inline before calling the main body of the function. This minimizes the memcg overhead in the pagefault and exit_mmap paths when memcgs are disabled using cgroup_disable=memory command-line option. This change results in ~1% overhead reduction when running PFT test [1] comparing {CONFIG_MEMCG=n} against {CONFIG_MEMCG=y, cgroup_disable=memory} configuration on an 8-core ARM64 Android device. [1] https://lkml.org/lkml/2006/8/29/294 also used in mmtests suite Link: https://lkml.kernel.org/r/20210713010934.299876-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Roman Gushchin Cc: Yang Shi Cc: Alex Shi Cc: Wei Yang Cc: Jens Axboe Cc: Joonsoo Kim Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Alistair Popple Cc: Minchan Kim Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 26 +++++++++++++++++++++++--- mm/memcontrol.c | 14 ++++---------- mm/swapfile.c | 5 +---- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6f5a43251593..f30d26b0f71d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +{ + if (mem_cgroup_disabled()) + return; + __cgroup_throttle_swaprate(page, gfp_mask); +} #else static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { @@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); -extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); +static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swap(page, entry); +} + +extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + if (mem_cgroup_disabled()) + return; + __mem_cgroup_uncharge_swap(entry, nr_pages); +} + extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33bb8434eea0..81e15a67391b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7226,7 +7226,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_try_charge_swap - try charging swap space for a page + * __mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * @@ -7234,16 +7234,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * * Returns 0 on success, -ENOMEM on failure. */ -int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { unsigned int nr_pages = thp_nr_pages(page); struct page_counter *counter; struct mem_cgroup *memcg; unsigned short oldid; - if (mem_cgroup_disabled()) - return 0; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; @@ -7279,18 +7276,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) } /** - * mem_cgroup_uncharge_swap - uncharge swap space + * __mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swapfile.c b/mm/swapfile.c index 627b16aed1dc..22d10f713848 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3779,14 +3779,11 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) +void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { struct swap_info_struct *si, *next; int nid = page_to_nid(page); - if (mem_cgroup_disabled()) - return; - if (!(gfp_mask & __GFP_IO)) return; From fab827dbee8c2e06ca4ba000fa6c48bcf9054aba Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:54:57 -0700 Subject: [PATCH 361/474] memcg: enable accounting for pids in nested pid namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") enabled memcg accounting for pids allocated from init_pid_ns.pid_cachep, but forgot to adjust the setting for nested pid namespaces. As a result, pid memory is not accounted exactly where it is really needed, inside memcg-limited containers with their own pid namespaces. Pid was one the first kernel objects enabled for memcg accounting. init_pid_ns.pid_cachep marked by SLAB_ACCOUNT and we can expect that any new pids in the system are memcg-accounted. Though recently I've noticed that it is wrong. nested pid namespaces creates own slab caches for pid objects, nested pids have increased size because contain id both for all parent and for own pid namespaces. The problem is that these slab caches are _NOT_ marked by SLAB_ACCOUNT, as a result any pids allocated in nested pid namespaces are not memcg-accounted. Pid struct in nested pid namespace consumes up to 500 bytes memory, 100000 such objects gives us up to ~50Mb unaccounted memory, this allow container to exceed assigned memcg limits. Link: https://lkml.kernel.org/r/8b6de616-fd1a-02c6-cbdb-976ecdcfa604@virtuozzo.com Fixes: 5d097056c9a0 ("kmemcg: account certain kmem allocations to memcg") Cc: stable@vger.kernel.org Signed-off-by: Vasily Averin Reviewed-by: Michal Koutný Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca43239a255a..cb5a25a8a0cc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) - *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); + *pkc = kmem_cache_create(name, len, 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); From 7e1c0d6f58207e7e60674647d3935f446f05613c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:00 -0700 Subject: [PATCH 362/474] memcg: switch lruvec stats to rstat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat") switched memcg stats to rstat infrastructure but skipped the conversion of the lruvec stats as such stats are read in the performance critical code paths and flushing stats may have impacted the performances of the applications. This patch converts the lruvec stats to rstat and later patches add mechanisms to keep the performance impact to minimum. The rstat conversion comes with the price i.e. memory cost. Effectively this patch reverts the savings done by the commit f3344adf38bd ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However this cost is justified due to negative impact of the inaccurate lruvec stats on many heuristics. One such case is reported in [1]. The memory reclaim code is filled with plethora of heuristics and many of those heuristics reads the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Huang Ying Cc: Hillf Danton Cc: Michal Koutný Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 42 +++++++------- mm/memcontrol.c | 114 +++++++++++++------------------------ 2 files changed, 58 insertions(+), 98 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a00bf337567b..47d35bef9f6e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -struct lruvec_stat { - long count[NR_VM_NODE_STAT_ITEMS]; -}; - -struct batched_lruvec_stat { - s32 count[NR_VM_NODE_STAT_ITEMS]; -}; - /* * Bitmap and deferred work of shrinker::id corresponding to memcg-aware * shrinkers, which have elements charged to this memcg. @@ -123,24 +115,30 @@ struct shrinker_info { unsigned long *map; }; +struct lruvec_stats_percpu { + /* Local (CPU and cgroup) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[NR_VM_NODE_STAT_ITEMS]; +}; + +struct lruvec_stats { + /* Aggregated (CPU and subtree) state */ + long state[NR_VM_NODE_STAT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; - /* - * Legacy local VM stats. This should be struct lruvec_stat and - * cannot be optimized to struct batched_lruvec_stat. Because - * the threshold of the lruvec_stat_cpu can be as big as - * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this - * filed has no upper limit. - */ - struct lruvec_stat __percpu *lruvec_stat_local; - - /* Subtree VM stats (batched updates) */ - struct batched_lruvec_stat __percpu *lruvec_stat_cpu; - atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; + struct lruvec_stats lruvec_stats; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat[idx]); + x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); + x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81e15a67391b..400d210e030a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) return x; } -static struct mem_cgroup_per_node * -parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) -{ - struct mem_cgroup *parent; - - parent = parent_mem_cgroup(pn->memcg); - if (!parent) - return NULL; - return parent->nodeinfo[nid]; -} - void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; - long x, threshold = MEMCG_CHARGE_BATCH; pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; @@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_state(memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - - if (vmstat_item_in_bytes(idx)) - threshold <<= PAGE_SHIFT; - - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); - if (unlikely(abs(x) > threshold)) { - pg_data_t *pgdat = lruvec_pgdat(lruvec); - struct mem_cgroup_per_node *pi; - - for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) - atomic_long_add(x, &pi->lruvec_stat[idx]); - x = 0; - } - __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); } /** @@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu) -{ - int nid; - - for_each_node(nid) { - struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; - unsigned long stat[NR_VM_NODE_STAT_ITEMS]; - struct batched_lruvec_stat *lstatc; - int i; - - lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { - stat[i] = lstatc->count[i]; - lstatc->count[i] = 0; - } - - do { - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - atomic_long_add(stat[i], &pn->lruvec_stat[i]); - } while ((pn = parent_nodeinfo(pn, nid))); - } -} - static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); - for_each_mem_cgroup(memcg) - memcg_flush_lruvec_page_state(memcg, cpu); - return 0; } @@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_local) { - kfree(pn); - return 1; - } - - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, - GFP_KERNEL_ACCOUNT); - if (!pn->lruvec_stat_cpu) { - free_percpu(pn->lruvec_stat_local); + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { kfree(pn); return 1; } @@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return; - free_percpu(pn->lruvec_stat_cpu); - free_percpu(pn->lruvec_stat_local); + free_percpu(pn->lruvec_stats_percpu); kfree(pn); } @@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) static void mem_cgroup_free(struct mem_cgroup *memcg) { - int cpu; - memcg_wb_domain_exit(memcg); - /* - * Flush percpu lruvec stats to guarantee the value - * correctness on parent's and all ancestor levels. - */ - for_each_online_cpu(cpu) - memcg_flush_lruvec_page_state(memcg, cpu); __mem_cgroup_free(memcg); } @@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; long delta, v; - int i; + int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent) parent->vmstats.events_pending[i] += delta; } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } } #ifdef CONFIG_MMU @@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + cgroup_rstat_flush(memcg->css.cgroup); + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; From aa48e47e3906c332eaf1e5d7b58be11d3509ad9f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:04 -0700 Subject: [PATCH 363/474] memcg: infrastructure to flush memcg stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment memcg stats are read in four contexts: 1. memcg stat user interfaces 2. dirty throttling 3. page fault 4. memory reclaim Currently the kernel flushes the stats for first two cases. Flushing the stats for remaining two casese may have performance impact. Always flushing the memcg stats on the page fault code path may negatively impacts the performance of the applications. In addition flushing in the memory reclaim code path, though treated as slowpath, can become the source of contention for the global lock taken for stat flushing because when system or memcg is under memory pressure, many tasks may enter the reclaim path. This patch uses following mechanisms to solve these challenges: 1. Periodically flush the stats from root memcg every 2 seconds. This will time limit the out of sync stats. 2. Asynchronously flush the stats after fixed number of stat updates. In the worst case the stat can be out of sync by O(nr_cpus * BATCH) for 2 seconds. 3. For avoiding thundering herd to flush the stats particularly from the memory reclaim context, introduce memcg local spinlock and let only one flusher active at a time. This could have been done through cgroup_rstat_lock lock but that lock is used by other subsystem and for userspace reading memcg stats. So, it is better to keep flushers introduced by this patch decoupled from cgroup_rstat_lock. However we would have to use irqsafe version of rstat flush but that is fine as this code path will be flushing for whole tree and do the work for everyone. No one will be waiting for that worker. [shakeelb@google.com: fix sleep-in-wrong context bug] Link: https://lkml.kernel.org/r/20210716212137.1391164-2-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-2-shakeelb@google.com Signed-off-by: Shakeel Butt Tested-by: Marek Szyprowski Cc: Hillf Danton Cc: Huang Ying Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 34 ++++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 ++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 47d35bef9f6e..c4c5b652d3af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1023,6 +1023,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return x; } +void mem_cgroup_flush_stats(void); + void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); @@ -1438,6 +1440,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } +static inline void mem_cgroup_flush_stats(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 400d210e030a..4d8c9afecf98 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,6 +103,14 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } +/* memcg and lruvec stats flushing */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static void flush_memcg_stats_work(struct work_struct *w); +static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work); +static DEFINE_PER_CPU(unsigned int, stats_flush_threshold); +static DEFINE_SPINLOCK(stats_flush_lock); + #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -674,6 +682,8 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH)) + queue_work(system_unbound_wq, &stats_flush_work); } /** @@ -5240,6 +5250,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); return 0; } @@ -5331,6 +5345,26 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } +void mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + spin_unlock(&stats_flush_lock); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + +static void flush_memcg_stats_work(struct work_struct *w) +{ + mem_cgroup_flush_stats(); +} + static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); diff --git a/mm/vmscan.c b/mm/vmscan.c index 268ad6570751..6c401b44a245 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2897,6 +2897,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); again: + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + memset(&sc->nr, 0, sizeof(sc->nr)); nr_reclaimed = sc->nr_reclaimed; From bb902cb47cf93b33cd92b3b7a4019330a03ef57f Mon Sep 17 00:00:00 2001 From: Yutian Yang Date: Thu, 2 Sep 2021 14:55:07 -0700 Subject: [PATCH 364/474] memcg: charge fs_context and legacy_fs_context This patch adds accounting flags to fs_context and legacy_fs_context allocation sites so that kernel could correctly charge these objects. We have written a PoC to demonstrate the effect of the missing-charging bugs. The PoC takes around 1,200MB unaccounted memory, while it is charged for only 362MB memory usage. We evaluate the PoC on QEMU x86_64 v5.2.90 + Linux kernel v5.10.19 + Debian buster. All the limitations including ulimits and sysctl variables are set as default. Specifically, the hard NOFILE limit and nr_open in sysctl are both 1,048,576. /*------------------------- POC code ----------------------------*/ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ } while (0) #define STACK_SIZE (8 * 1024) #ifndef __NR_fsopen #define __NR_fsopen 430 #endif static inline int fsopen(const char *fs_name, unsigned int flags) { return syscall(__NR_fsopen, fs_name, flags); } static char thread_stack[512][STACK_SIZE]; int thread_fn(void* arg) { for (int i = 0; i< 800000; ++i) { int fsfd = fsopen("nfs", FSOPEN_CLOEXEC); if (fsfd == -1) { errExit("fsopen"); } } while(1); return 0; } int main(int argc, char *argv[]) { int thread_pid; for (int i = 0; i < 1; ++i) { thread_pid = clone(thread_fn, thread_stack[i] + STACK_SIZE, \ SIGCHLD, NULL); } while(1); return 0; } /*-------------------------- end --------------------------------*/ Link: https://lkml.kernel.org/r/1626517201-24086-1-git-send-email-nglaive@gmail.com Signed-off-by: Yutian Yang Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs_context.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_context.c b/fs/fs_context.c index de1985eae535..b7e43a780a62 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; From 79f6540ba88dfb383ecf057a3425e668105ca774 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:10 -0700 Subject: [PATCH 365/474] memcg: enable accounting for mnt_cache entries Patch series "memcg accounting from OpenVZ", v7. OpenVZ uses memory accounting 20+ years since v2.2.x linux kernels. Initially we used our own accounting subsystem, then partially committed it to upstream, and a few years ago switched to cgroups v1. Now we're rebasing again, revising our old patches and trying to push them upstream. We try to protect the host system from any misuse of kernel memory allocation triggered by untrusted users inside the containers. Patch-set is addressed mostly to cgroups maintainers and cgroups@ mailing list, though I would be very grateful for any comments from maintainersi of affected subsystems or other people added in cc: Compared to the upstream, we additionally account the following kernel objects: - network devices and its Tx/Rx queues - ipv4/v6 addresses and routing-related objects - inet_bind_bucket cache objects - VLAN group arrays - ipv6/sit: ip_tunnel_prl - scm_fp_list objects used by SCM_RIGHTS messages of Unix sockets - nsproxy and namespace objects itself - IPC objects: semaphores, message queues and share memory segments - mounts - pollfd and select bits arrays - signals and posix timers - file lock - fasync_struct used by the file lease code and driver's fasync queues - tty objects - per-mm LDT We have an incorrect/incomplete/obsoleted accounting for few other kernel objects: sk_filter, af_packets, netlink and xt_counters for iptables. They require rework and probably will be dropped at all. Also we're going to add an accounting for nft, however it is not ready yet. We have not tested performance on upstream, however, our performance team compares our current RHEL7-based production kernel and reports that they are at least not worse as the according original RHEL7 kernel. This patch (of 10): The kernel allocates ~400 bytes of 'struct mount' for any new mount. Creating a new mount namespace clones most of the parent mounts, and this can be repeated many times. Additionally, each mount allocates up to PATH_MAX=4096 bytes for mnt->mnt_devname. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/045db11f-4a45-7c9b-2664-5b32c2b44943@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Acked-by: Christian Brauner Cc: Tejun Heo Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Yutian Yang Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Serge Hallyn Cc: Thomas Gleixner Cc: Zefan Li Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 97adcb5ab5d5..e51b63ae233b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -4240,7 +4241,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), From b655843444152c0a14b749308e4cb35d91cbcf0b Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:14 -0700 Subject: [PATCH 366/474] memcg: enable accounting for pollfd and select bits arrays User can call select/poll system calls with a large number of assigned file descriptors and force kernel to allocate up to several pages of memory till end of these sleeping system calls. We have here long-living unaccounted per-task allocations. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/56e31cb5-6e1e-bdba-d7ca-be64b9842363@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/select.c b/fs/select.c index 945896d0ac9e..e83e563a351d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kvmalloc(alloc_size, GFP_KERNEL); + bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT); if (!bits) goto out_nofds; } @@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!walk) { err = -ENOMEM; goto out_fds; From 0f12156dff2862ac54235fc72703f18770769042 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:19 -0700 Subject: [PATCH 367/474] memcg: enable accounting for file lock caches User can create file locks for each open file and force kernel to allocate small but long-living objects per each open file. It makes sense to account for these objects to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/b009f4c7-f0ab-c0ec-8e83-918f47d677da@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 74b2a1dfe8d8..1bc7ede75f18 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -3056,10 +3056,12 @@ static int __init filelock_init(void) int i; flctx_cache = kmem_cache_create("file_lock_ctx", - sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + sizeof(struct file_lock_context), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); filelock_cache = kmem_cache_create("file_lock_cache", - sizeof(struct file_lock), 0, SLAB_PANIC, NULL); + sizeof(struct file_lock), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); From 839d68206de869b8cb4272c5ea10da2ef7ec34cb Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:23 -0700 Subject: [PATCH 368/474] memcg: enable accounting for fasync_cache fasync_struct is used by almost all character device drivers to set up the fasync queue, and for regular files by the file lease code. This structure is quite small but long-living and it can be assigned for any open file. It makes sense to account for its allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/1b408625-d71c-0b26-b0b6-9baf00f93e69@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index f946bec8f1f1..714e7c9a5fc4 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1049,7 +1049,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } From 30acd0bdfb86548172168a0cc71d455944de0683 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:27 -0700 Subject: [PATCH 369/474] memcg: enable accounting for new namesapces and struct nsproxy Container admin can create new namespaces and force kernel to allocate up to several pages of memory for the namespaces and its associated structures. Net and uts namespaces have enabled accounting for such allocations. It makes sense to account for rest ones to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/5525bcbf-533e-da27-79b7-158686c64e13@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Serge Hallyn Acked-by: Christian Brauner Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 4 ++-- kernel/user_namespace.c | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index e51b63ae233b..94a9817851cc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3307,7 +3307,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); diff --git a/ipc/namespace.c b/ipc/namespace.c index 7bd0766ddc3b..ae83f0f2651b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index f5e8828c109c..0d5c29879a50 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) struct cgroup_namespace *new_ns; int ret; - new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index abc01fcad8c7..eec72ca962e2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -568,6 +568,6 @@ out: int __init nsproxy_cache_init(void) { - nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); + nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index cb5a25a8a0cc..a46a3723bc66 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -450,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = { static __init int pid_namespaces_init(void) { - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_paths(kern_path, pid_ns_ctl_table); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 12eab0d2ae28..aec832801c26 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL); + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ef82d401dde8..6b2e3ca7ee99 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { - user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); From 18319498fdd4cdf8c1c2c48cd432863b1f915d6f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:31 -0700 Subject: [PATCH 370/474] memcg: enable accounting of ipc resources When user creates IPC objects it forces kernel to allocate memory for these long-living objects. It makes sense to account them to restrict the host's memory consumption from inside the memcg-limited container. This patch enables accounting for IPC shared memory segments, messages semaphores and semaphore's undo lists. Link: https://lkml.kernel.org/r/d6507b06-4df6-78f8-6c54-3ae86e3b5339@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 2 +- ipc/sem.c | 9 +++++---- ipc/shm.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 6810276d6bb9..a0d05775af2c 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int msgflg = params->flg; - msq = kmalloc(sizeof(*msq), GFP_KERNEL); + msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; diff --git a/ipc/sem.c b/ipc/sem.c index 971e75d28364..1a8b9f0ac047 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -514,7 +514,7 @@ static struct sem_array *sem_alloc(size_t nsems) if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; - sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL); + sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; @@ -1855,7 +1855,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1941,7 +1941,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) /* step 2: allocate new undo structure */ new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); @@ -2005,7 +2005,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (nsops > ns->sc_semopm) return -E2BIG; if (nsops > SEMOPM_FAST) { - sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + sops = kvmalloc_array(nsops, sizeof(*sops), + GFP_KERNEL_ACCOUNT); if (sops == NULL) return -ENOMEM; } diff --git a/ipc/shm.c b/ipc/shm.c index 748933e376ca..ab749be6d8b7 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = kmalloc(sizeof(*shp), GFP_KERNEL); + shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; From 5f58c39819ff78ca5ddbba2b3cd8ff4779b19bb5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:35 -0700 Subject: [PATCH 371/474] memcg: enable accounting for signals When a user send a signal to any another processes it forces the kernel to allocate memory for 'struct sigqueue' objects. The number of signals is limited by RLIMIT_SIGPENDING resource limit, but even the default settings allow each user to consume up to several megabytes of memory. It makes sense to account for these allocations to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/e34e958c-e785-712e-a62a-2c7b66c646c7@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..8921c4a8419f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4663,7 +4663,7 @@ void __init signals_init(void) { siginfo_buildtime_checks(); - sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB From c509723ec27e925bb91a20682c448e95d4bc8c9f Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:39 -0700 Subject: [PATCH 372/474] memcg: enable accounting for posix_timers_cache slab A program may create multiple interval timers using timer_create(). For each timer the kernel preallocates a "queued real-time signal", Consequently, the number of timers is limited by the RLIMIT_SIGPENDING resource limit. The allocated object is quite small, ~250 bytes, but even the default signal limits allow to consume up to 100 megabytes per user. It makes sense to account for them to limit the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/57795560-025c-267c-6b1a-dea852d95530@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Thomas Gleixner Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/posix-timers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..7363f81dc31a 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, SLAB_PANIC, - NULL); + sizeof(struct k_itimer), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); From ec403e2ae0dfc85996aad6e944a98a16e6dfcc6d Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:43 -0700 Subject: [PATCH 373/474] memcg: enable accounting for ldt_struct objects Each task can request own LDT and force the kernel to allocate up to 64Kb memory per-mm. There are legitimate workloads with hundreds of processes and there can be hundreds of workloads running on large machines. The unaccounted memory can cause isolation issues between the workloads particularly on highly utilized machines. It makes sense to account for this objects to restrict the host's memory consumption from inside the memcg-limited container. Link: https://lkml.kernel.org/r/38010594-50fe-c06d-7cb0-d1f77ca422f3@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Borislav Petkov Reviewed-by: Shakeel Butt Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrei Vagin Cc: Borislav Petkov Cc: Christian Brauner Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Jens Axboe Cc: Jiri Slaby Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Oleg Nesterov Cc: Roman Gushchin Cc: Serge Hallyn Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Yutian Yang Cc: Zefan Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index aa15132228da..525876e7b9f4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); From 96e51ccf1af33e82f429a0d6baebba29c6448d0f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:55:46 -0700 Subject: [PATCH 374/474] memcg: cleanup racy sum avoidance code We used to have per-cpu memcg and lruvec stats and the readers have to traverse and sum the stats from each cpu. This summing was racy and may expose transient negative values. So, an explicit check was added to avoid such scenarios. Now these stats are moved to rstat infrastructure and are no more per-cpu, so we can remove the fixup for transient negative values. Link: https://lkml.kernel.org/r/20210728012243.3369123-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4c5b652d3af..2ee20509ac71 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -977,30 +977,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(memcg->vmstats.state[idx]); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = READ_ONCE(pn->lruvec_stats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; + return READ_ONCE(pn->lruvec_stats.state[idx]); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, From 55a68c823951855f3ec313fdb85100db84284505 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:55:49 -0700 Subject: [PATCH 375/474] memcg: replace in_interrupt() by !in_task() in active_memcg() set_active_memcg() uses in_interrupt() check to select proper storage for cgroup: pointer on task struct or per-cpu pointer. It isn't fully correct: obsoleted in_interrupt() includes tasks with disabled BH. It's better to use '!in_task()' instead. Link: https://lkml.org/lkml/2021/7/26/487 Link: https://lkml.kernel.org/r/ed4448b0-4970-616f-7368-ef9dd3cb628d@virtuozzo.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Vasily Averin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 2 +- mm/memcontrol.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8894825cc4db..5561486fddef 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; - if (in_interrupt()) { + if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d8c9afecf98..b09f2639f28b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -878,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task); static __always_inline struct mem_cgroup *active_memcg(void) { - if (in_interrupt()) + if (!in_task()) return this_cpu_read(int_active_memcg); else return current->active_memcg; From 37bc3cb9bbef86d1ddbbc789e55b588c8a2cac26 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 2 Sep 2021 14:55:53 -0700 Subject: [PATCH 376/474] mm: memcontrol: set the correct memcg swappiness restriction Since commit c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") has expended the swappiness value to make swap to be preferred in some systems. We should also change the memcg swappiness restriction to allow memcg swap-preferred. Link: https://lkml.kernel.org/r/d77469b90c45c49953ccbc51e54a1d465bc18f70.1627626255.git.baolin.wang@linux.alibaba.com Fixes: c843966c556d ("mm: allow swappiness that prefers reclaiming anon over the file workingset") Signed-off-by: Baolin Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b09f2639f28b..80840e026c2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4062,7 +4062,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - if (val > 100) + if (val > 200) return -EINVAL; if (!mem_cgroup_is_root(memcg)) From bec49c067c679e9b7ca7c1aac50b56618c12d879 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:56 -0700 Subject: [PATCH 377/474] mm, memcg: remove unused functions Since commit 2d146aa3aa84 ("mm: memcontrol: switch to rstat"), last user of memcg_stat_item_in_bytes() is gone. And since commit fa40d1ee9f15 ("mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node()"), only the declaration of mem_cgroup_select_victim_node() is remained here. Remove them. Link: https://lkml.kernel.org/r/20210807082835.61281-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Alex Shi Cc: Matthew Wilcox (Oracle) Cc: Vladimir Davydov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ee20509ac71..69e6c5462c43 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -593,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) } #endif -static __always_inline bool memcg_stat_item_in_bytes(int idx) -{ - if (idx == MEMCG_PERCPU_B) - return true; - return vmstat_item_in_bytes(idx); -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -904,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return !!(memcg->css.flags & CSS_ONLINE); } -/* - * For memory reclaim. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); - void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); From 27fb0956ed08780e480a14c5cca2fd4818e99fa7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:55:59 -0700 Subject: [PATCH 378/474] mm, memcg: save some atomic ops when flush is already true Add 'else' to save some atomic ops in obj_stock_flush_required() when flush is already true. No functional change intended here. Link: https://lkml.kernel.org/r/20210807082835.61281-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80840e026c2f..894a24d5ec55 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2246,7 +2246,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) if (memcg && stock->nr_pages && mem_cgroup_is_descendant(memcg, root_memcg)) flush = true; - if (obj_stock_flush_required(stock, root_memcg)) + else if (obj_stock_flush_required(stock, root_memcg)) flush = true; rcu_read_unlock(); From 5c49cf9ad600f705f595ecbbbac2a5da9a3bb3bd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 2 Sep 2021 14:56:02 -0700 Subject: [PATCH 379/474] memcg: fix up drain_local_stock comment Thomas and Vlastimil have noticed that the comment in drain_local_stock doesn't quite make sense. It talks about a synchronization with the memory hotplug but there is no actual memory hotplug involvement here. I meant to talk about cpu hotplug here. Fix that up and hopefuly make the comment more helpful by referencing the cpu hotplug callback as well. Link: https://lkml.kernel.org/r/YRDwOhVglJmY7ES5@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 894a24d5ec55..81fa83133c2c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2178,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy) unsigned long flags; /* - * The only protection from memory hotplug vs. drain_stock races is - * that we always operate on local CPU stock here with IRQ disabled + * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs. + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled */ local_irq_save(flags); From 4ba9515d32bac1c54c2357796e32d68eeab784ce Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 2 Sep 2021 14:56:05 -0700 Subject: [PATCH 380/474] memcg: make memcg->event_list_lock irqsafe The memcg->event_list_lock is usually taken in the normal context but when the userspace closes the corresponding eventfd, eventfd_release through memcg_event_wake takes memcg->event_list_lock with interrupts disabled. This is not an issue on its own but it creates a nested dependency from eventfd_ctx->wqh.lock to memcg->event_list_lock. Independently, for unrelated eventfd, eventfd_signal() can be called in the irq context, thus making eventfd_ctx->wqh.lock an irq lock. For example, FPGA DFL driver, VHOST VPDA driver and couple of VFIO drivers. This will force memcg->event_list_lock to be an irqsafe lock as well. One way to break the nested dependency between eventfd_ctx->wqh.lock and memcg->event_list_lock is to add an indirection. However the simplest solution would be to make memcg->event_list_lock irqsafe. This is cgroup v1 feature, is in maintenance and may get deprecated in near future. So, no need to add more code. BTW this has been discussed previously [1] but there weren't irq users of eventfd_signal() at the time. [1] https://www.spinics.net/lists/cgroups/msg06248.html Link: https://lkml.kernel.org/r/20210830172953.207257-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Tejun Heo Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81fa83133c2c..06ae0075e864 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4839,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, vfs_poll(efile.file, &event->pt); - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_add(&event->list, &memcg->event_list); - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); fdput(cfile); fdput(efile); @@ -5268,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace. */ - spin_lock(&memcg->event_list_lock); + spin_lock_irq(&memcg->event_list_lock); list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { list_del_init(&event->list); schedule_work(&event->remove); } - spin_unlock(&memcg->event_list_lock); + spin_unlock_irq(&memcg->event_list_lock); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); From 6260618e09d375ee6aa19be16813dac40cc47513 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Thu, 2 Sep 2021 14:56:08 -0700 Subject: [PATCH 381/474] selftests/vm: use kselftest skip code for skipped tests There are several test cases in the vm directory are still using exit 0 when they need to be skipped. Use the kselftest framework to skip code instead so it can help us to distinguish the return status. Criterion to filter out what should be fixed in vm directory: grep -r "exit 0" -B1 | grep -i skip This change might cause some false-positives if people are running these test scripts directly and only checking their return codes, which will change from 0 to 4. However I think the impact should be small as most of our scripts here are already using this skip code. And there will be no such issue if running them with the kselftest framework. Link: https://lkml.kernel.org/r/20210823073433.37653-1-po-hsu.lin@canonical.com Signed-off-by: Po-Hsu Lin Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/charge_reserved_hugetlb.sh | 5 ++++- tools/testing/selftests/vm/hugetlb_reparenting_test.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh index 18d33684faad..fe8fcfb334e0 100644 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -1,11 +1,14 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi fault_limit_file=limit_in_bytes diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh index d11d1febccc3..4a9a3afe9fd4 100644 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -1,11 +1,14 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + set -e if [[ $(id -u) -ne 0 ]]; then echo "This test must be run as root. Skipping..." - exit 0 + exit $ksft_skip fi usage_file=usage_in_bytes From 0c52ec9513b309b250046fdb2c4c6c3726fa5cfb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Sep 2021 14:56:11 -0700 Subject: [PATCH 382/474] selftests: Fix spelling mistake "cann't" -> "cannot" There is a spelling mistake in an error message. Fix it. Link: https://lkml.kernel.org/r/20210826121217.12885-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/mlock-random-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c index ff4d72eb74b9..782ea94dee2f 100644 --- a/tools/testing/selftests/vm/mlock-random-test.c +++ b/tools/testing/selftests/vm/mlock-random-test.c @@ -70,7 +70,7 @@ int get_proc_locked_vm_size(void) } } - perror("cann't parse VmLck in /proc/self/status\n"); + perror("cannot parse VmLck in /proc/self/status\n"); fclose(f); return -1; } From 79c62de859f7e854399efb84a2f04ceeb1c13cc9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:26 -0700 Subject: [PATCH 383/474] mmc: JZ4740: remove the flush_kernel_dcache_page call in jz4740_mmc_read_data Patch series "_kernel_dcache_page fixes and removal". While looking to convert the block layer away from kmap_atomic towards kmap_local_page and prefeably the helpers that abstract it away I noticed that a few block drivers directly or implicitly call flush_kernel_dcache_page before kunmapping a page that has been written to. flush_kernel_dcache_page is documented to to be used in such cases, but flush_dcache_page is actually required when the page could be in the page cache and mapped to userspace, which is pretty much always the case when kmapping an arbitrary page. Unfortunately the documentation doesn't exactly make that clear, which lead to this misused. And it turns out that only the copy_strings / copy_string_kernel in the exec code were actually correct users of flush_kernel_dcache_page, which is why I think we should just remove it and eat the very minor overhead in exec rather than confusing poor driver writers. This patch (of 6): MIPS now implements flush_kernel_dcache_page (as an alias to flush_dcache_page). Link: https://lkml.kernel.org/r/20210712060928.4161649-1-hch@lst.de Link: https://lkml.kernel.org/r/20210712060928.4161649-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: "James E.J. Bottomley" Cc: Russell King Cc: Guo Ren Cc: Thomas Bogendoerfer Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Helge Deller Cc: Yoshinori Sato Cc: Rich Felker Cc: Geoff Levand Cc: Paul Cercueil Cc: Ulf Hansson Cc: Alex Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mmc/host/jz4740_mmc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index cb1a64a5c256..80a2c270d502 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -578,10 +578,6 @@ static bool jz4740_mmc_read_data(struct jz4740_mmc_host *host, } } data->bytes_xfered += miter->length; - - /* This can go away once MIPS implements - * flush_kernel_dcache_page */ - flush_dcache_page(miter->page); } sg_miter_stop(miter); From 64a05fe645e27a318470a9f31c76c86f36725689 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:30 -0700 Subject: [PATCH 384/474] mmc: mmc_spi: replace flush_kernel_dcache_page with flush_dcache_page Pages passed to block drivers can be mapped page cache pages, so we must use flush_dcache_page here instead of the more limited flush_kernel_dcache_page that is intended for highmem pages only. Link: https://lkml.kernel.org/r/20210712060928.4161649-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mmc/host/mmc_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 65c65bb5737f..3d28a3d3001b 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -948,7 +948,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd, /* discard mappings */ if (direction == DMA_FROM_DEVICE) - flush_kernel_dcache_page(sg_page(sg)); + flush_dcache_page(sg_page(sg)); kunmap(sg_page(sg)); if (dma_dev) dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir); From 0e84f5dbf8d6c33d685c45300da55bafd5dd786e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:33 -0700 Subject: [PATCH 385/474] scatterlist: replace flush_kernel_dcache_page with flush_dcache_page Pages used in scatterlist can be mapped page cache pages (and often are), so we must use flush_dcache_page here instead of the more limited flush_kernel_dcache_page that is intended for highmem pages only. Also remove the PageSlab check given that page_mapping_file as used by the flush_dcache_page implementations already contains that check. Link: https://lkml.kernel.org/r/20210712060928.4161649-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/scatterlist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 27efa6178153..627aa84f8bbd 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -887,9 +887,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter) miter->__offset += miter->consumed; miter->__remaining -= miter->consumed; - if ((miter->__flags & SG_MITER_TO_SG) && - !PageSlab(miter->page)) - flush_kernel_dcache_page(miter->page); + if (miter->__flags & SG_MITER_TO_SG) + flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON_ONCE(preemptible()); From f358afc52c3066f4e8cd7b3a2d75b31e822519e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Sep 2021 14:56:36 -0700 Subject: [PATCH 386/474] mm: remove flush_kernel_dcache_page flush_kernel_dcache_page is a rather confusing interface that implements a subset of flush_dcache_page by not being able to properly handle page cache mapped pages. The only callers left are in the exec code as all other previous callers were incorrect as they could have dealt with page cache pages. Replace the calls to flush_kernel_dcache_page with calls to flush_dcache_page, which for all architectures does either exactly the same thing, can contains one or more of the following: 1) an optimization to defer the cache flush for page cache pages not mapped into userspace 2) additional flushing for mapped page cache pages if cache aliases are possible Link: https://lkml.kernel.org/r/20210712060928.4161649-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Linus Torvalds Reviewed-by: Ira Weiny Cc: Alex Shi Cc: Geoff Levand Cc: Greentime Hu Cc: Guo Ren Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Nick Hu Cc: Paul Cercueil Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Ulf Hansson Cc: Vincent Chen Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 76 ++++++++----------- .../translations/zh_CN/core-api/cachetlb.rst | 9 --- arch/arm/include/asm/cacheflush.h | 4 +- arch/arm/mm/flush.c | 33 -------- arch/arm/mm/nommu.c | 6 -- arch/csky/abiv1/cacheflush.c | 11 --- arch/csky/abiv1/inc/abi/cacheflush.h | 4 +- arch/mips/include/asm/cacheflush.h | 8 +- arch/nds32/include/asm/cacheflush.h | 3 +- arch/nds32/mm/cacheflush.c | 9 --- arch/parisc/include/asm/cacheflush.h | 8 +- arch/parisc/kernel/cache.c | 3 +- arch/sh/include/asm/cacheflush.h | 8 +- block/blk-map.c | 2 +- fs/exec.c | 6 +- include/linux/highmem.h | 5 +- tools/testing/scatterlist/linux/mm.h | 1 - 17 files changed, 46 insertions(+), 150 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index fe4290e26729..8aed9103e48a 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -271,10 +271,15 @@ maps this page at its virtual address. ``void flush_dcache_page(struct page *page)`` - Any time the kernel writes to a page cache page, _OR_ - the kernel is about to read from a page cache page and - user space shared/writable mappings of this page potentially - exist, this routine is called. + This routines must be called when: + + a) the kernel did write to a page that is in the page cache page + and / or in high memory + b) the kernel is about to read from a page cache page and user space + shared/writable mappings of this page potentially exist. Note + that {get,pin}_user_pages{_fast} already call flush_dcache_page + on any page found in the user address space and thus driver + code rarely needs to take this into account. .. note:: @@ -284,38 +289,34 @@ maps this page at its virtual address. handling vfs symlinks in the page cache need not call this interface at all. - The phrase "kernel writes to a page cache page" means, - specifically, that the kernel executes store instructions - that dirty data in that page at the page->virtual mapping - of that page. It is important to flush here to handle - D-cache aliasing, to make sure these kernel stores are - visible to user space mappings of that page. + The phrase "kernel writes to a page cache page" means, specifically, + that the kernel executes store instructions that dirty data in that + page at the page->virtual mapping of that page. It is important to + flush here to handle D-cache aliasing, to make sure these kernel stores + are visible to user space mappings of that page. - The corollary case is just as important, if there are users - which have shared+writable mappings of this file, we must make - sure that kernel reads of these pages will see the most recent - stores done by the user. + The corollary case is just as important, if there are users which have + shared+writable mappings of this file, we must make sure that kernel + reads of these pages will see the most recent stores done by the user. - If D-cache aliasing is not an issue, this routine may - simply be defined as a nop on that architecture. + If D-cache aliasing is not an issue, this routine may simply be defined + as a nop on that architecture. - There is a bit set aside in page->flags (PG_arch_1) as - "architecture private". The kernel guarantees that, - for pagecache pages, it will clear this bit when such - a page first enters the pagecache. + There is a bit set aside in page->flags (PG_arch_1) as "architecture + private". The kernel guarantees that, for pagecache pages, it will + clear this bit when such a page first enters the pagecache. - This allows these interfaces to be implemented much more - efficiently. It allows one to "defer" (perhaps indefinitely) - the actual flush if there are currently no user processes - mapping this page. See sparc64's flush_dcache_page and - update_mmu_cache implementations for an example of how to go - about doing this. + This allows these interfaces to be implemented much more efficiently. + It allows one to "defer" (perhaps indefinitely) the actual flush if + there are currently no user processes mapping this page. See sparc64's + flush_dcache_page and update_mmu_cache implementations for an example + of how to go about doing this. - The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree, just mark the architecture - private page flag bit. Later, in update_mmu_cache(), a check is - made of this flag bit, and if set the flush is done and the flag - bit is cleared. + The idea is, first at flush_dcache_page() time, if page_file_mapping() + returns a mapping, and mapping_mapped on that mapping returns %false, + just mark the architecture private page flag bit. Later, in + update_mmu_cache(), a check is made of this flag bit, and if set the + flush is done and the flag bit is cleared. .. important:: @@ -351,19 +352,6 @@ maps this page at its virtual address. architectures). For incoherent architectures, it should flush the cache of the page at vmaddr. - ``void flush_kernel_dcache_page(struct page *page)`` - - When the kernel needs to modify a user page is has obtained - with kmap, it calls this function after all modifications are - complete (but before kunmapping it) to bring the underlying - page up to date. It is assumed here that the user has no - incoherent cached copies (i.e. the original page was obtained - from a mechanism like get_user_pages()). The default - implementation is a nop and should remain so on all coherent - architectures. On incoherent architectures, this should flush - the kernel cache for page (using page_address(page)). - - ``void flush_icache_range(unsigned long start, unsigned long end)`` When the kernel stores into addresses that it will execute diff --git a/Documentation/translations/zh_CN/core-api/cachetlb.rst b/Documentation/translations/zh_CN/core-api/cachetlb.rst index 8376485a534d..55827b8a7c53 100644 --- a/Documentation/translations/zh_CN/core-api/cachetlb.rst +++ b/Documentation/translations/zh_CN/core-api/cachetlb.rst @@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。 用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性 的架构,它应该刷新vmaddr处的页面缓存。 - ``void flush_kernel_dcache_page(struct page *page)`` - - 当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在 - kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用 - 户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制 - 中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不 - 一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。 - - ``void flush_icache_range(unsigned long start, unsigned long end)`` 当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。 diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 2e24e765e6d3..5e56288e343b 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { if ((cache_is_vivt() || cache_is_vipt_aliasing())) @@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(vma, page, vmaddr); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 6d89db7895d1..7ff9feea13a6 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -345,39 +345,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -/* - * Ensure cache coherency for the kernel mapping of this page. We can - * assume that the page is pinned via kmap. - * - * If the page only exists in the page cache and there are no user - * space mappings, this is a no-op since the page was already marked - * dirty at creation. Otherwise, we need to flush the dirty kernel - * cache lines directly. - */ -void flush_kernel_dcache_page(struct page *page) -{ - if (cache_is_vivt() || cache_is_vipt_aliasing()) { - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) { - void *addr; - - addr = page_address(page); - /* - * kmap_atomic() doesn't set the page virtual - * address for highmem pages, and - * kunmap_atomic() takes care of cache - * flushing already. - */ - if (!IS_ENABLED(CONFIG_HIGHMEM) || addr) - __cpuc_flush_dcache_area(addr, PAGE_SIZE); - } - } -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - /* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. The expected sequence is: diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 8b3d7191e2b8..2658f52903da 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void flush_kernel_dcache_page(struct page *page) -{ - __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *dst, const void *src, unsigned long len) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 07ff17ea33de..fb91b069dc69 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, } } -void flush_kernel_dcache_page(struct page *page) -{ - struct address_space *mapping; - - mapping = page_mapping_file(page); - - if (!mapping || mapping_mapped(mapping)) - dcache_wbinv_all(); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 6cab7afae962..ed62e2066ba7 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *); #define flush_cache_page(vma, page, pfn) cache_wbinv_all() #define flush_cache_dup_mm(mm) cache_wbinv_all() -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -extern void flush_kernel_dcache_page(struct page *); - #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { dcache_wbinv_all(); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d687b40b9fbb..b3dc9c589442 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void) kunmap_coherent(); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - BUG_ON(cpu_has_dc_aliases && PageHighMem(page)); - flush_dcache_page(page); -} - +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 /* * For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a * cache writeback and invalidate operation. diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index 7d6824f7c0e8..c2a222ebfa2a 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -void flush_kernel_dcache_page(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *addr, int size); void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&(mapping)->i_pages) diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index ad5344ef5d33..07aac65d1cab 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma, local_irq_restore(flags); } -void flush_kernel_dcache_page(struct page *page) -{ - unsigned long flags; - local_irq_save(flags); - cpu_dcache_wbinval_page((unsigned long)page_address(page)); - local_irq_restore(flags); -} -EXPORT_SYMBOL(flush_kernel_dcache_page); - void flush_kernel_vmap_range(void *addr, int size) { unsigned long flags; diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 99663fc1f997..eef0096db5f8 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -36,16 +36,12 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE void flush_kernel_dcache_page_addr(void *addr); -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_kernel_dcache_page_addr(page_address(page)); -} #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 void flush_kernel_vmap_range(void *vaddr, int size); void invalidate_kernel_vmap_range(void *vaddr, int size); @@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) #define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page(page); \ + flush_kernel_dcache_page_addr(page_address(page)); \ flush_kernel_icache_page(page_address(page)); \ } while (0) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 86a1a63563fd..39e02227e231 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page) return; } - flush_kernel_dcache_page(page); + flush_kernel_dcache_page_addr(page_address(page)); if (!mapping) return; @@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); -EXPORT_SYMBOL(flush_kernel_dcache_page_asm); EXPORT_SYMBOL(flush_data_cache_local); EXPORT_SYMBOL(flush_kernel_icache_range_asm); diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 4486a865ff62..372afa82fee6 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma, if (boot_cpu_data.dcache.n_aliases && PageAnon(page)) __flush_anon_page(page, vmaddr); } + +#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) { __flush_wback_region(addr, size); @@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size) __flush_invalidate_region(addr, size); } -#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ - flush_dcache_page(page); -} - extern void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len); diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..4639bc6b5c62 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, static void bio_invalidate_vmalloc_pages(struct bio *bio) { -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..41a888d4edde 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -574,7 +574,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, } if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -592,7 +592,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, ret = 0; out: if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -634,7 +634,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_kernel_dcache_page(page); + flush_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d9a606a9fc64..b4c49f9cc379 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page } #endif -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} +#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index f9a12005fcea..16ec895bbe5f 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags) #define kmemleak_free(a) #define PageSlab(p) (0) -#define flush_kernel_dcache_page(p) #define MAX_ERRNO 4095 From f00230ff8411eaecbea1f2e528e205424f3725ba Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Sep 2021 14:56:40 -0700 Subject: [PATCH 387/474] mm,do_huge_pmd_numa_page: remove unnecessary TLB flushing code Before commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling"), the TLB flushing is done in do_huge_pmd_numa_page() itself via flush_tlb_range(). But after commit c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling"), the TLB flushing is done in migrate_pages() as in the following code path anyway. do_huge_pmd_numa_page migrate_misplaced_page migrate_pages So now, the TLB flushing code in do_huge_pmd_numa_page() becomes unnecessary. So the code is deleted in this patch to simplify the code. This is only code cleanup, there's no visible performance difference. The mmu_notifier_invalidate_range() in do_huge_pmd_numa_page() is deleted too. Because migrate_pages() takes care of that too when CPU TLB is flushed. Link: https://lkml.kernel.org/r/20210720065529.716031-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Reviewed-by: Yang Shi Cc: Dan Carpenter Cc: Mel Gorman Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vasily Gorbik Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 890fb73ac89b..5e9ef0fc261e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) goto out; } - /* - * Since we took the NUMA fault, we must have observed the !accessible - * bit. Make sure all other CPUs agree with that, to avoid them - * modifying the page we're about to migrate. - * - * Must be done under PTL such that we'll observe the relevant - * inc_tlb_flush_pending(). - * - * We are not sure a pending tlb flush here is for a huge page - * mapping or not. Hence use the tlb range variant - */ - if (mm_tlb_flush_pending(vma->vm_mm)) { - flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); - /* - * change_huge_pmd() released the pmd lock before - * invalidating the secondary MMUs sharing the primary - * MMU pagetables (with ->invalidate_range()). The - * mmu_notifier_invalidate_range_end() (which - * internally calls ->invalidate_range()) in - * change_pmd_range() will run after us, so we can't - * rely on it here and we need an explicit invalidate. - */ - mmu_notifier_invalidate_range(vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); - } - pmd = pmd_modify(oldpmd, vma->vm_page_prot); page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) From e15710bf04063766f428048f4dad7b73b646203f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Sep 2021 14:56:43 -0700 Subject: [PATCH 388/474] mm: change fault_in_pages_* to have an unsigned size parameter fault_in_pages_writeable() and fault_in_pages_readable() treat the size parameter as unsigned, doing pointer math with the value, so make this explicit and set it to be a size_t type which all callers currently treat it as anyway. This solves the issue where static checkers get nervous seeing pointer arithmetic happening with a signed value. Link: https://lkml.kernel.org/r/20210727111136.457638-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman Reported-by: Jordy Zomer Cc: Matthew Wilcox Cc: David Howells Cc: William Kucharski Cc: "Darrick J. Wong" Cc: Hugh Dickins Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ed02aa522263..5dcf446f42e5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ -static inline int fault_in_pages_writeable(char __user *uaddr, int size) +static inline int fault_in_pages_writeable(char __user *uaddr, size_t size) { char __user *end = uaddr + size - 1; @@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size) return 0; } -static inline int fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, size_t size) { volatile char c; const char __user *end = uaddr + size - 1; From 5b78ed24e8ec48602c1d6f5a188e58d000c81e2b Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 2 Sep 2021 14:56:46 -0700 Subject: [PATCH 389/474] mm/pagemap: add mmap_assert_locked() annotations to find_vma*() find_vma() and variants need protection when used. This patch adds mmap_assert_lock() calls in the functions. To make sure the invariant is satisfied, we also need to add a mmap_read_lock() around the get_user_pages_remote() call in get_arg_page(). The lock is not strictly necessary because the mm has been newly created, but the extra cost is limited because the same mutex was also acquired shortly before in __bprm_mm_init(), so it is hot and uncontended. [penguin-kernel@i-love.sakura.ne.jp: TOMOYO needs the same protection which get_arg_page() needs] Link: https://lkml.kernel.org/r/58bb6bf7-a57e-8a40-e74b-39584b415152@i-love.sakura.ne.jp Link: https://lkml.kernel.org/r/20210731175341.3458608-1-lrizzo@google.com Signed-off-by: Luigi Rizzo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ mm/mmap.c | 2 ++ security/tomoyo/domain.c | 13 +++++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 41a888d4edde..fd292e9da4f9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ + mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); + mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; diff --git a/mm/mmap.c b/mm/mmap.c index ca54d36d203a..79f4f8ae43ec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -534,6 +534,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, { struct rb_node **__rb_link, *__rb_parent, *rb_prev; + mmap_assert_locked(mm); __rb_link = &mm->mm_rb.rb_node; rb_prev = __rb_parent = NULL; @@ -2303,6 +2304,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct rb_node *rb_node; struct vm_area_struct *vma; + mmap_assert_locked(mm); /* Check the cache first. */ vma = vmacache_find(mm, addr); if (likely(vma)) diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 98d985895ec8..31af29f669d2 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -897,6 +897,9 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; +#ifdef CONFIG_MMU + int ret; +#endif /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { @@ -909,11 +912,13 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, /* * This is called at execve() time in order to dig around * in the argv/environment of the new proceess - * (represented by bprm). 'current' is the process doing - * the execve(). + * (represented by bprm). */ - if (get_user_pages_remote(bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL, NULL) <= 0) + mmap_read_lock(bprm->mm); + ret = get_user_pages_remote(bprm->mm, pos, 1, + FOLL_FORCE, &page, NULL, NULL); + mmap_read_unlock(bprm->mm); + if (ret <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; From 9b593cb20283e68e5e65b09ca10038935297f05b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 2 Sep 2021 14:56:49 -0700 Subject: [PATCH 390/474] remap_file_pages: Use vma_lookup() instead of find_vma() Using vma_lookup() verifies the start address is contained in the found vma. This results in easier to read code. Link: https://lkml.kernel.org/r/20210817135234.1550204-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 79f4f8ae43ec..52fed230dc21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2994,14 +2994,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_vma(mm, start); + vma = vma_lookup(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start) - goto out; - if (start + size > vma->vm_end) { struct vm_area_struct *next; From 5e22928abe671ea1541f19afb80523442938d342 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 2 Sep 2021 14:56:52 -0700 Subject: [PATCH 391/474] mm/mremap: fix memory account on do_munmap() failure mremap will account the delta between new_len and old_len in vma_to_resize, and then call move_vma when expanding an existing memory mapping. In function move_vma, there are two scenarios when calling do_munmap: 1. move_page_tables from old_addr to new_addr success 2. move_page_tables from old_addr to new_addr fail In first scenario, it should account old_len if do_munmap fail, because the delta has already been accounted. In second scenario, new_addr/new_len will assign to old_addr/old_len if move_page_table fail, so do_munmap is try to unmap new_addr actually, if do_munmap fail, it should account the new_len, because error code will be return from move_vma, and delta will be unaccounted. What'more, because of new_len == old_len, so account old_len also is OK. In summary, account old_len will be correct if do_munmap fail. Link: https://lkml.kernel.org/r/20210717101942.120607-1-chenwandun@huawei.com Fixes: 51df7bcb6151 ("mm/mremap: account memory on do_munmap() failure") Signed-off-by: Chen Wandun Acked-by: Dmitry Safonov Cc: Kefeng Wang Cc: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 5989d3990020..badfe17ade1f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) - vm_acct_memory(new_len >> PAGE_SHIFT); + vm_acct_memory(old_len >> PAGE_SHIFT); excess = 0; } From cdcfc631c80e716d4b3bf534471273456bb99556 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 2 Sep 2021 14:56:55 -0700 Subject: [PATCH 392/474] mm/bootmem_info.c: mark __init on register_page_bootmem_info_section register_page_bootmem_info_section() is only called from __init functions, so mark it __init as well. Link: https://lkml.kernel.org/r/20210817042221.77172-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index 5b152dba7344..f03f42f426f6 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -39,7 +39,7 @@ void put_page_bootmem(struct page *page) } #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; @@ -74,7 +74,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) } #else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void register_page_bootmem_info_section(unsigned long start_pfn) +static void __init register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long mapsize, section_nr, i; struct mem_section *ms; From a1bc561bb2d3d9b944878955095f53aeba30a166 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:56:58 -0700 Subject: [PATCH 393/474] mm: sparse: pass section_nr to section_mark_present Patch series "mm: sparse: remove __section_nr() function", v4. This patch (of 3): With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. Since both callers of section_mark_present already know section_nr, let's also pass section_nr as well as mem_section in order to reduce costly translation. Link: https://lkml.kernel.org/r/20210707150212.855-1-ohoono.kwon@samsung.com Link: https://lkml.kernel.org/r/20210707150212.855-2-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Mike Rapoport Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 6326cdf36c4f..8018ee7fcda5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -187,10 +187,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * those loops early. */ unsigned long __highest_present_section_nr; -static void section_mark_present(struct mem_section *ms) +static void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) { - unsigned long section_nr = __section_nr(ms); - if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; @@ -280,7 +279,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (!ms->section_mem_map) { ms->section_mem_map = sparse_encode_early_nid(nid) | SECTION_IS_ONLINE; - section_mark_present(ms); + __section_mark_present(ms, section); } } } @@ -934,7 +933,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); - section_mark_present(ms); + __section_mark_present(ms, section_nr); /* Align memmap to section boundary in the subsection case */ if (section_nr_to_pfn(section_nr) != start_pfn) From fc1f5e980a463325cf41d39ac6a69aa3cca73995 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:01 -0700 Subject: [PATCH 394/474] mm: sparse: pass section_nr to find_memory_block With CONFIG_SPARSEMEM_EXTREME enabled, __section_nr() which converts mem_section to section_nr could be costly since it iterates all section roots to check if the given mem_section is in its range. On the other hand, __nr_to_section() which converts section_nr to mem_section can be done in O(1). Let's pass section_nr instead of mem_section ptr to find_memory_block() in order to reduce needless iterations. Link: https://lkml.kernel.org/r/20210707150212.855-3-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +--- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..d4f28ee4d5dc 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -211,13 +211,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb) { unsigned long section_nr; - struct mem_section *mem_sect; struct memory_block *mem_block; section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr)); - mem_sect = __nr_to_section(section_nr); - mem_block = find_memory_block(mem_sect); + mem_block = find_memory_block(section_nr); return mem_block; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index aa31a21f33d7..e3fd2dbf4eea 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id) /* * Called under device_hotplug_lock. */ -struct memory_block *find_memory_block(struct mem_section *section) +struct memory_block *find_memory_block(unsigned long section_nr) { - unsigned long block_id = memory_block_id(__section_nr(section)); + unsigned long block_id = memory_block_id(section_nr); return find_memory_block_by_id(block_id); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..d9a0b61cd432 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern struct memory_block *find_memory_block(struct mem_section *); +extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); From 11e02d3729da1a2d4a33db5ea61291770d411884 Mon Sep 17 00:00:00 2001 From: Ohhoon Kwon Date: Thu, 2 Sep 2021 14:57:04 -0700 Subject: [PATCH 395/474] mm: sparse: remove __section_nr() function As the last users of __section_nr() are gone, let's remove unused function __section_nr(). Link: https://lkml.kernel.org/r/20210707150212.855-4-ohoono.kwon@samsung.com Signed-off-by: Ohhoon Kwon Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/sparse.c | 26 -------------------------- 2 files changed, 27 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fcb535560028..8827f4d081d4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1342,7 +1342,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } -extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* diff --git a/mm/sparse.c b/mm/sparse.c index 8018ee7fcda5..d85655242ed9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif -#ifdef CONFIG_SPARSEMEM_EXTREME -unsigned long __section_nr(struct mem_section *ms) -{ - unsigned long root_nr; - struct mem_section *root = NULL; - - for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) { - root = __nr_to_section(root_nr * SECTIONS_PER_ROOT); - if (!root) - continue; - - if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) - break; - } - - VM_BUG_ON(!root); - - return (root_nr * SECTIONS_PER_ROOT) + (ms - root); -} -#else -unsigned long __section_nr(struct mem_section *ms) -{ - return (unsigned long)(ms - mem_section[0]); -} -#endif - /* * During early boot, before section_mem_map is used for an actual * mem_map, we use section_mem_map to store the section's NUMA From 01c8d337d195ed105cabab95bc4dcb9e145bf5ea Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 2 Sep 2021 14:57:07 -0700 Subject: [PATCH 396/474] mm/sparse: set SECTION_NID_SHIFT to 6 Currently SECTION_NID_SHIFT is set to 3, which is incorrect because bit 3 and 4 can be overlapped by sub-field for early NID, and can be unexpectedly set on NUMA systems. There are a few non-critical issues related to this: - Having SECTION_TAINT_ZONE_DEVICE set for wrong sections forces pfn_to_online_page() through the slow path, but doesn't actually break the kernel. - A kdump generation tool like makedumpfile uses this field to calculate the physical address to read. So wrong bits can make the tool access to wrong address and fail to create kdump. This can be avoided by the tool, so it's not critical. To fix it, set SECTION_NID_SHIFT to 6 which is the minimum number of available bits of section flag field. Link: https://lkml.kernel.org/r/20210707045548.810271-1-naoya.horiguchi@linux.dev Fixes: 1f90a3477df3 ("mm: teach pfn_to_online_page() about ZONE_DEVICE section collisions") Signed-off-by: Naoya Horiguchi Reported-by: Kazuhito Hagio Suggested-by: Dan Williams Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wang Wensheng Cc: Rui Xiang Cc: Kazu Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8827f4d081d4..59bad25ce78e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1364,7 +1364,7 @@ extern size_t mem_section_usage_size(void); #define SECTION_TAINT_ZONE_DEVICE (1UL<<4) #define SECTION_MAP_LAST_BIT (1UL<<5) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 3 +#define SECTION_NID_SHIFT 6 static inline struct page *__section_mem_map_addr(struct mem_section *section) { From e0dbb2bccf19ce5e870afb420a3d0480c582bb7b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 2 Sep 2021 14:57:10 -0700 Subject: [PATCH 397/474] include/linux/mmzone.h: avoid a warning in sparse memory support cppcheck warns that we're possibly losing information by shifting an int. It's a false positive, because we don't allow for a NUMA node ID that large, but if we ever change SECTION_NID_SHIFT, it could become a problem, and in any case this is usually a legitimate warning. Fix it by adding the necessary cast, which makes the compiler generate the right code. Link: https://lkml.kernel.org/r/YOya+aBZFFmC476e@casper.infradead.org Link: https://lkml.kernel.org/r/202107130348.6LsVT9Nc-lkp@intel.com Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index d85655242ed9..be7936e65b6a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -117,7 +117,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) */ static inline unsigned long sparse_encode_early_nid(int nid) { - return (nid << SECTION_NID_SHIFT); + return ((unsigned long)nid << SECTION_NID_SHIFT); } static inline int sparse_early_nid(struct mem_section *section) From bdbda735508ca83341899a77f143e4d5c58007b3 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Thu, 2 Sep 2021 14:57:13 -0700 Subject: [PATCH 398/474] mm/sparse: clarify pgdat_to_phys Clarify pgdat_to_phys() by testing if pgdat == &contig_page_data when CONFIG_NUMA=n. We only expect contig_page_data in such case, so we use &contig_page_data directly instead of pgdat. No functional change intended when CONFIG_BUG_VM=n. Comment from Mark [1]: " ... and I reckon it'd be clearer and more robust to define pgdat_to_phys() in the same ifdefs as contig_page_data so that these, stay in-sync. e.g. have: | #ifdef CONFIG_NUMA | #define pgdat_to_phys(x) virt_to_phys(x) | #else /* CONFIG_NUMA */ | | extern struct pglist_data contig_page_data; | ... | #define pgdat_to_phys(x) __pa_symbol(&contig_page_data) | | #endif /* CONIFIG_NUMA */ " [1] https://lore.kernel.org/linux-arm-kernel/20210615131902.GB47121@C02TD0UTHF1T.local/ Link: https://lkml.kernel.org/r/20210723123342.26406-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: David Hildenbrand Acked-by: Mike Rapoport Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index be7936e65b6a..fd29b3abffa0 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -321,7 +321,8 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA - return __pa_symbol(pgdat); + VM_BUG_ON(pgdat != &contig_page_data); + return __pa_symbol(&contig_page_data); #else return __pa(pgdat); #endif From 343ab8178f318b6006d54865972ff9c433b29e10 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 2 Sep 2021 14:57:16 -0700 Subject: [PATCH 399/474] mm/vmalloc: use batched page requests in bulk-allocator In case of simultaneous vmalloc allocations, for example it is 1GB and 12 CPUs my system is able to hit "BUG: soft lockup" for !CONFIG_PREEMPT kernel. RIP: 0010:__alloc_pages_bulk+0xa9f/0xbb0 Call Trace: __vmalloc_node_range+0x11c/0x2d0 __vmalloc_node+0x4b/0x70 fix_size_alloc_test+0x44/0x60 [test_vmalloc] test_func+0xe7/0x1f0 [test_vmalloc] kthread+0x11a/0x140 ret_from_fork+0x22/0x30 To address this issue invoke a bulk-allocator many times until all pages are obtained, i.e. do batched page requests adding cond_resched() meanwhile to reschedule. Batched value is hard-coded and is 100 pages per call. Link: https://lkml.kernel.org/r/20210707182639.31282-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Hillf Danton Cc: Matthew Wilcox Cc: Mel Gorman Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5cd52805149..24bc65f02d04 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn); static inline unsigned int vm_area_alloc_pages(gfp_t gfp, int nid, - unsigned int order, unsigned long nr_pages, struct page **pages) + unsigned int order, unsigned int nr_pages, struct page **pages) { unsigned int nr_allocated = 0; @@ -2789,10 +2789,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * to fails, fallback to a single page allocator that is * more permissive. */ - if (!order) - nr_allocated = alloc_pages_bulk_array_node( - gfp, nid, nr_pages, pages); - else + if (!order) { + while (nr_allocated < nr_pages) { + unsigned int nr, nr_pages_request; + + /* + * A maximum allowed request is hard-coded and is 100 + * pages per call. That is done in order to prevent a + * long preemption off scenario in the bulk-allocator + * so the range is [1:100]. + */ + nr_pages_request = min(100U, nr_pages - nr_allocated); + + nr = alloc_pages_bulk_array_node(gfp, nid, + nr_pages_request, pages + nr_allocated); + + nr_allocated += nr; + cond_resched(); + + /* + * If zero or pages were obtained partly, + * fallback to a single page allocator. + */ + if (nr != nr_pages_request) + break; + } + } else /* * Compound pages required for remap_vmalloc_page if * high-order pages. From 12e376a6f859a000308b6c7cf4a2493eda2bb026 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 2 Sep 2021 14:57:19 -0700 Subject: [PATCH 400/474] mm/vmalloc: remove gfpflags_allow_blocking() check Get rid of gfpflags_allow_blocking() check from the vmalloc() path as it is supposed to be sleepable anyway. Thus remove it from the alloc_vmap_area() as well as from the vm_area_alloc_pages(). Link: https://lkml.kernel.org/r/20210707182639.31282-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 24bc65f02d04..5dcb65dd9bf3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1479,6 +1479,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; + unsigned long freed; unsigned long addr; int purged = 0; int ret; @@ -1542,13 +1543,12 @@ overflow: goto retry; } - if (gfpflags_allow_blocking(gfp_mask)) { - unsigned long freed = 0; - blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); - if (freed > 0) { - purged = 0; - goto retry; - } + freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + + if (freed > 0) { + purged = 0; + goto retry; } if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) @@ -2838,9 +2838,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid, for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; - if (gfpflags_allow_blocking(gfp)) - cond_resched(); - + cond_resched(); nr_allocated += 1U << order; } From f8bcbecfb6b48c026e2205679c063d1f16f5a2c0 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 2 Sep 2021 14:57:23 -0700 Subject: [PATCH 401/474] lib/test_vmalloc.c: add a new 'nr_pages' parameter In order to simulate different fixed sizes for vmalloc allocation introduce a new parameter that sets number of pages to be allocated for the "fix_size_alloc_test" test. By default 1 page is used unless a different number is specified over the new parameter. Link: https://lkml.kernel.org/r/20210710194151.21370-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_vmalloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 01e9543de566..e14993bc84d2 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -35,6 +35,9 @@ __param(int, test_repeat_count, 1, __param(int, test_loop_count, 1000000, "Set test loop counter"); +__param(int, nr_pages, 0, + "Set number of pages for fix_size_alloc_test(default: 1)"); + __param(int, run_test_mask, INT_MAX, "Set tests specified in the mask.\n\n" "\t\tid: 1, name: fix_size_alloc_test\n" @@ -262,7 +265,7 @@ static int fix_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = vmalloc(3 * PAGE_SIZE); + ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE); if (!ptr) return -1; From f181234a5a21fd0a86b793330016b92c7b3ed8ee Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 2 Sep 2021 14:57:26 -0700 Subject: [PATCH 402/474] mm/vmalloc: fix wrong behavior in vread commit f608788cd2d6 ("mm/vmalloc: use rb_tree instead of list for vread() lookups") use rb_tree instread of list to speed up lookup, but function __find_vmap_area is try to find a vmap_area that include target address, if target address is smaller than the leftmost node in vmap_area_root, it will return NULL, then vread will read nothing. This behavior is different from the primitive semantics. The correct way is find the first vmap_are that bigger than target addr, that is what function find_vmap_area_exceed_addr does. Link: https://lkml.kernel.org/r/20210714015959.3204871-1-chenwandun@huawei.com Fixes: f608788cd2d6 ("mm/vmalloc: use rb_tree instead of list for vread() lookups") Signed-off-by: Chen Wandun Reported-by: Hulk Robot Cc: Serapheim Dimitropoulos Cc: Uladzislau Rezki (Sony) Cc: Kefeng Wang Cc: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5dcb65dd9bf3..3824dc16ce1c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -787,6 +787,28 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +{ + struct vmap_area *va = NULL; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *tmp; + + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end > addr) { + va = tmp; + if (tmp->va_start <= addr) + break; + + n = n->rb_left; + } else + n = n->rb_right; + } + + return va; +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -3287,9 +3309,14 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = find_vmap_area_exceed_addr((unsigned long)addr); if (!va) goto finished; + + /* no intersects with alive vmap_area */ + if ((unsigned long)addr + count <= va->va_start) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; From c9d1af2b780a7077d253047ccc81c5253cf0974a Mon Sep 17 00:00:00 2001 From: Woody Lin Date: Thu, 2 Sep 2021 14:57:29 -0700 Subject: [PATCH 403/474] mm/kasan: move kasan.fault to mm/kasan/report.c Move the boot parameter 'kasan.fault' from hw_tags.c to report.c, so it can support all KASAN modes - generic, and both tag-based. Link: https://lkml.kernel.org/r/20210713010536.3161822-1-woodylin@google.com Signed-off-by: Woody Lin Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 13 ++++++---- mm/kasan/hw_tags.c | 43 ------------------------------- mm/kasan/kasan.h | 1 - mm/kasan/report.c | 29 ++++++++++++++++++--- 4 files changed, 34 insertions(+), 52 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 83ec4a556c19..21dc03bc10a4 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -181,9 +181,16 @@ By default, KASAN prints a bug report only for the first invalid memory access. With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This effectively disables ``panic_on_warn`` for KASAN reports. +Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot +parameter can be used to control panic and reporting behaviour: + +- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN + report or also panic the kernel (default: ``report``). The panic happens even + if ``kasan_multi_shot`` is enabled. + Hardware tag-based KASAN mode (see the section about various modes below) is intended for use in production as a security mitigation. Therefore, it supports -boot parameters that allow disabling KASAN or controlling its features. +additional boot parameters that allow disabling KASAN or controlling features: - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -199,10 +206,6 @@ boot parameters that allow disabling KASAN or controlling its features. - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). -- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN - report or also panic the kernel (default: ``report``). The panic happens even - if ``kasan_multi_shot`` is enabled. - Implementation details ---------------------- diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4ea8c368b5b8..51903639e55f 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -37,16 +37,9 @@ enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_ON, }; -enum kasan_arg_fault { - KASAN_ARG_FAULT_DEFAULT, - KASAN_ARG_FAULT_REPORT, - KASAN_ARG_FAULT_PANIC, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; -static enum kasan_arg_fault kasan_arg_fault __ro_after_init; /* Whether KASAN is enabled at all. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); @@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async); /* Whether to collect alloc/free stack traces. */ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); -/* Whether to panic or print a report and disable tag checking on fault. */ -bool kasan_flag_panic __ro_after_init; - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); -/* kasan.fault=report/panic */ -static int __init early_kasan_fault(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "report")) - kasan_arg_fault = KASAN_ARG_FAULT_REPORT; - else if (!strcmp(arg, "panic")) - kasan_arg_fault = KASAN_ARG_FAULT_PANIC; - else - return -EINVAL; - - return 0; -} -early_param("kasan.fault", early_kasan_fault); - /* kasan_init_hw_tags_cpu() is called for each CPU. */ void kasan_init_hw_tags_cpu(void) { @@ -197,22 +170,6 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_fault) { - case KASAN_ARG_FAULT_DEFAULT: - /* - * Default to no panic on report. - * Do nothing, kasan_flag_panic keeps its default value. - */ - break; - case KASAN_ARG_FAULT_REPORT: - /* Do nothing, kasan_flag_panic keeps its default value. */ - break; - case KASAN_ARG_FAULT_PANIC: - /* Enable panic on report. */ - kasan_flag_panic = true; - break; - } - pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d739cdd1621a..fa02c88b6948 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -37,7 +37,6 @@ static inline bool kasan_async_mode_enabled(void) #endif -extern bool kasan_flag_panic __ro_after_init; extern bool kasan_flag_async __ro_after_init; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8fff1825b22c..884a950c7026 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -39,6 +39,31 @@ static unsigned long kasan_flags; #define KASAN_BIT_REPORTED 0 #define KASAN_BIT_MULTI_SHOT 1 +enum kasan_arg_fault { + KASAN_ARG_FAULT_DEFAULT, + KASAN_ARG_FAULT_REPORT, + KASAN_ARG_FAULT_PANIC, +}; + +static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT; + +/* kasan.fault=report/panic */ +static int __init early_kasan_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kasan_arg_fault = KASAN_ARG_FAULT_REPORT; + else if (!strcmp(arg, "panic")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.fault", early_kasan_fault); + bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); @@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr) panic_on_warn = 0; panic("panic_on_warn set ...\n"); } -#ifdef CONFIG_KASAN_HW_TAGS - if (kasan_flag_panic) + if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); -#endif kasan_enable_current(); } From ab512805710fa0e4ec6b61fee8a52d044a060009 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:32 -0700 Subject: [PATCH 404/474] kasan: test: rework kmalloc_oob_right Patch series "kasan: test: avoid crashing the kernel with HW_TAGS", v2. KASAN tests do out-of-bounds and use-after-free accesses. Running the tests works fine for the GENERIC mode, as it uses qurantine and redzones. But the HW_TAGS mode uses neither, and running the tests might crash the kernel. Rework the tests to avoid corrupting kernel memory. This patch (of 8): Rework kmalloc_oob_right() to do these bad access checks: 1. An unaligned access one byte past the requested kmalloc size (can only be detected by KASAN_GENERIC). 2. An aligned access into the first out-of-bounds granule that falls within the aligned kmalloc object. 3. Out-of-bounds access past the aligned kmalloc object. Test #3 deliberately uses a read access to avoid corrupting memory. Otherwise, this test might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Link: https://lkml.kernel.org/r/cover.1628779805.git.andreyknvl@gmail.com Link: https://lkml.kernel.org/r/474aa8b7b538c6737a4c6d0090350af2e1776bef.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 8f7b0b2f6e11..1bc3cdd2957f 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -122,12 +122,28 @@ static void kasan_test_exit(struct kunit *test) static void kmalloc_oob_right(struct kunit *test) { char *ptr; - size_t size = 123; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x'); + /* + * An unaligned access past the requested kmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x'); + + /* + * An aligned access into the first out-of-bounds granule that falls + * within the aligned kmalloc object. + */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); + + /* Out-of-bounds access past the aligned kmalloc object. */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); + kfree(ptr); } From 8fbad19bdcb4b9be8131536e5bb9616ab2e4eeb3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:35 -0700 Subject: [PATCH 405/474] kasan: test: avoid writing invalid memory Multiple KASAN tests do writes past the allocated objects or writes to freed memory. Turn these writes into reads to avoid corrupting memory. Otherwise, these tests might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Link: https://lkml.kernel.org/r/c3cd2a383e757e27dd9131635fc7d09a48a49cf9.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1bc3cdd2957f..c82a82eb5393 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -167,7 +167,7 @@ static void kmalloc_node_oob_right(struct kunit *test) ptr = kmalloc_node(size, GFP_KERNEL, 0); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); kfree(ptr); } @@ -203,7 +203,7 @@ static void kmalloc_pagealloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); } static void kmalloc_pagealloc_invalid_free(struct kunit *test) @@ -237,7 +237,7 @@ static void pagealloc_oob_right(struct kunit *test) ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); free_pages((unsigned long)ptr, order); } @@ -252,7 +252,7 @@ static void pagealloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); free_pages((unsigned long)ptr, order); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); } static void kmalloc_large_oob_right(struct kunit *test) @@ -514,7 +514,7 @@ static void kmalloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); } static void kmalloc_uaf_memset(struct kunit *test) @@ -553,7 +553,7 @@ again: goto again; } - KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); kfree(ptr2); @@ -700,7 +700,7 @@ static void ksize_unpoisons_memory(struct kunit *test) ptr[size] = 'x'; /* This one must. */ - KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y'); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]); kfree(ptr); } From 555999a009aacd90ea51a6690e8eb2a5d0427edc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:38 -0700 Subject: [PATCH 406/474] kasan: test: avoid corrupting memory via memset kmalloc_oob_memset_*() tests do writes past the allocated objects. As the result, they corrupt memory, which might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. Adjust the tests to only write memory within the aligned kmalloc objects. Also add a comment mentioning that memset tests are designed to touch both valid and invalid memory. Link: https://lkml.kernel.org/r/64fd457668a16e7b58d094f14a165f9d5170c5a9.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c82a82eb5393..db73bc9e3fa2 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -428,64 +428,70 @@ static void kmalloc_uaf_16(struct kunit *test) kfree(ptr1); } +/* + * Note: in the memset tests below, the written range touches both valid and + * invalid memory. This makes sure that the instrumentation does not only check + * the starting address but the whole range. + */ + static void kmalloc_oob_memset_2(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); kfree(ptr); } static void kmalloc_oob_memset_4(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); kfree(ptr); } - static void kmalloc_oob_memset_8(struct kunit *test) { char *ptr; - size_t size = 8; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); kfree(ptr); } static void kmalloc_oob_memset_16(struct kunit *test) { char *ptr; - size_t size = 16; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16)); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); kfree(ptr); } static void kmalloc_oob_in_memset(struct kunit *test) { char *ptr; - size_t size = 666; + size_t size = 128 - KASAN_GRANULE_SIZE; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF)); + KUNIT_EXPECT_KASAN_FAIL(test, + memset(ptr, 0, size + KASAN_GRANULE_SIZE)); kfree(ptr); } From 1b0668be62cfa394903bb368641c80533bf42d5a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:41 -0700 Subject: [PATCH 407/474] kasan: test: disable kmalloc_memmove_invalid_size for HW_TAGS The HW_TAGS mode doesn't check memmove for negative size. As a result, the kmalloc_memmove_invalid_size test corrupts memory, which can result in a crash. Disable this test with HW_TAGS KASAN. Link: https://lkml.kernel.org/r/088733a06ac21eba29aa85b6f769d2abd74f9638.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index db73bc9e3fa2..1f533a7346d9 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -501,11 +501,17 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) size_t size = 64; volatile size_t invalid_size = -2; + /* + * Hardware tag-based mode doesn't check memmove for negative size. + * As a result, this test introduces a side-effect memory corruption, + * which can result in a crash. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); - KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); From 25b12a58e848459ae2dbf2e7d318ef168bd1c5e2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:44 -0700 Subject: [PATCH 408/474] kasan: test: only do kmalloc_uaf_memset for generic mode kmalloc_uaf_memset() writes to freed memory, which is only safe with the GENERIC mode (as it uses quarantine). For other modes, this test corrupts kernel memory, which might result in a crash. Only enable kmalloc_uaf_memset() for the GENERIC mode. Link: https://lkml.kernel.org/r/2e1c87b607b1292556cde3cab2764f108542b60c.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1f533a7346d9..1dcba6dbfc97 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -534,6 +534,12 @@ static void kmalloc_uaf_memset(struct kunit *test) char *ptr; size_t size = 33; + /* + * Only generic KASAN uses quarantine, which is required to avoid a + * kernel memory corruption this test causes. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); From b38fcca339dbcf680c9e43054502608fabc81508 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:47 -0700 Subject: [PATCH 409/474] kasan: test: clean up ksize_uaf Some KASAN tests use global variables to store function returns values so that the compiler doesn't optimize away these functions. ksize_uaf() doesn't call any functions, so it doesn't need to use kasan_int_result. Use volatile accesses instead, to be consistent with other similar tests. Link: https://lkml.kernel.org/r/a1fc34faca4650f4a6e4dfb3f8d8d82c82eb953a.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 1dcba6dbfc97..30f2cde96e81 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -737,8 +737,8 @@ static void ksize_uaf(struct kunit *test) kfree(ptr); KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size)); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); } static void kasan_stack_oob(struct kunit *test) From 756e5a47a5ddf0caa3708f922385a92af9d330b5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:50 -0700 Subject: [PATCH 410/474] kasan: test: avoid corrupting memory in copy_user_test copy_user_test() does writes past the allocated object. As the result, it corrupts kernel memory, which might lead to crashes with the HW_TAGS mode, as it neither uses quarantine nor redzones. (Technically, this test can't yet be enabled with the HW_TAGS mode, but this will be implemented in the future.) Adjust the test to only write memory within the aligned kmalloc object. Link: https://lkml.kernel.org/r/19bf3a5112ee65b7db88dc731643b657b816c5e8.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan_module.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index f1017f345d6c..fa73b9df0be4 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -15,13 +15,11 @@ #include "../mm/kasan/kasan.h" -#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) - static noinline void __init copy_user_test(void) { char *kmem; char __user *usermem; - size_t size = 10; + size_t size = 128 - KASAN_GRANULE_SIZE; int __maybe_unused unused; kmem = kmalloc(size, GFP_KERNEL); @@ -38,25 +36,25 @@ static noinline void __init copy_user_test(void) } pr_info("out-of-bounds in copy_from_user()\n"); - unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = copy_from_user(kmem, usermem, size + 1); pr_info("out-of-bounds in copy_to_user()\n"); - unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = copy_to_user(usermem, kmem, size + 1); pr_info("out-of-bounds in __copy_from_user()\n"); - unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = __copy_from_user(kmem, usermem, size + 1); pr_info("out-of-bounds in __copy_to_user()\n"); - unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = __copy_to_user(usermem, kmem, size + 1); pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); - unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); - unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); pr_info("out-of-bounds in strncpy_from_user()\n"); - unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF); + unused = strncpy_from_user(kmem, usermem, size + 1); vm_munmap((unsigned long)usermem, PAGE_SIZE); kfree(kmem); From f16de0bcdb55bf18e2533ca625f3e4b4952f254c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 2 Sep 2021 14:57:53 -0700 Subject: [PATCH 411/474] kasan: test: avoid corrupting memory in kasan_rcu_uaf kasan_rcu_uaf() writes to freed memory via kasan_rcu_reclaim(), which is only safe with the GENERIC mode (as it uses quarantine). For other modes, this test corrupts kernel memory, which might result in a crash. Turn the write into a read. Link: https://lkml.kernel.org/r/b6f2c3bf712d2457c783fa59498225b66a634f62.1628779805.git.andreyknvl@gmail.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan_module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index fa73b9df0be4..7ebf433edef3 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -71,7 +71,7 @@ static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) struct kasan_rcu_info, rcu); kfree(fp); - fp->i = 1; + ((volatile struct kasan_rcu_info *)fp)->i; } static noinline void __init kasan_rcu_uaf(void) From c3ab6baf6a004eab7344a1d8880a971f2414e1b6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:57:56 -0700 Subject: [PATCH 412/474] mm/page_alloc: always initialize memory map for the holes Patch series "mm: ensure consistency of memory map poisoning". Currently memory map allocation for FLATMEM case does not poison the struct pages regardless of CONFIG_PAGE_POISON setting. This happens because allocation of the memory map for FLATMEM and SPARSMEM use different memblock functions and those that are used for SPARSMEM case (namely memblock_alloc_try_nid_raw() and memblock_alloc_exact_nid_raw()) implicitly poison the allocated memory. Another side effect of this implicit poisoning is that early setup code that uses the same functions to allocate memory burns cycles for the memory poisoning even if it was not intended. These patches introduce memmap_alloc() wrapper that ensure that the memory map allocation is consistent for different memory models. This patch (of 4): Currently memory map for the holes is initialized only when SPARSEMEM memory model is used. Yet, even with FLATMEM there could be holes in the physical memory layout that have memory map entries. For instance, the memory reserved using e820 API on i386 or "reserved-memory" nodes in device tree would not appear in memblock.memory and hence the struct pages for such holes will be skipped during memory map initialization. These struct pages will be zeroed because the memory map for FLATMEM systems is allocated with memblock_alloc_node() that clears the allocated memory. While zeroed struct pages do not cause immediate problems, the correct behaviour is to initialize every page using __init_single_page(). Besides, enabling page poison for FLATMEM case will trigger PF_POISONED_CHECK() unless the memory map is properly initialized. Make sure init_unavailable_range() is called for both SPARSEMEM and FLATMEM so that struct pages representing memory holes would appear as PG_Reserved with any memory layout. [rppt@kernel.org: fix microblaze] Link: https://lkml.kernel.org/r/YQWW3RCE4eWBuMu/@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210714123739.16493-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: David Hildenbrand Tested-by: Guenter Roeck Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/page.h | 3 +-- mm/page_alloc.c | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index ce550978f4fc..4b8b2fa78fc5 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) - +# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60a867a549bd..bd2efd6287ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6642,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -6687,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn, pr_info("On node %d, zone %s: %lld pages in unavailable ranges", node, zone_names[zone], pgcnt); } -#else -static inline void init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) -{ -} -#endif static void __init memmap_init_zone_range(struct zone *zone, unsigned long start_pfn, From 22e7878102f94a50e9a4c2c19f909a9a0898c4ce Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:57:59 -0700 Subject: [PATCH 413/474] microblaze: simplify pte_alloc_one_kernel() The microblaze's implementation of pte_alloc_one_kernel() used memblock_alloc_try_nid_raw() along with clear_page() to allocated a zeroed page during early setup. Replace calls of these functions with a call to memblock_alloc_try_nid() that already returns zeroed page and respects the same allocation limits as memblock_alloc_try_nid_raw(). While on it drop early_get_page() wrapper that was only used in pte_alloc_one_kernel(). Link: https://lkml.kernel.org/r/20210714123739.16493-3-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/pgtable.h | 2 -- arch/microblaze/mm/init.c | 12 ------------ arch/microblaze/mm/pgtable.c | 17 ++++++++--------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 71cd547655d9..c136a01e467e 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -443,8 +443,6 @@ extern int mem_init_done; asmlinkage void __init mmu_init(void); -void __init *early_get_page(void); - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index ab55c70380a5..952f35b335b2 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void) dma_contiguous_reserve(memory_start + lowmem_size - 1); } -/* This is only called until mem_init is done. */ -void __init *early_get_page(void) -{ - /* - * Mem start + kernel_tlb -> here is limit - * because of mem mapping from head.S - */ - return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE, - MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb, - NUMA_NO_NODE); -} - void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 38ccb909bc9d..c1833b159d3b 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr) __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - pte_t *pte; - if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - } else { - pte = (pte_t *)early_get_page(); - if (pte) - clear_page(pte); - } - return pte; + if (mem_init_done) + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + MEMBLOCK_LOW_LIMIT, + memory_start + kernel_tlb, + NUMA_NO_NODE); } void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) From c803b3c8b3b70f306ee6300bf8acdd70ffd1441a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:02 -0700 Subject: [PATCH 414/474] mm: introduce memmap_alloc() to unify memory map allocation There are several places that allocate memory for the memory map: alloc_node_mem_map() for FLATMEM, sparse_buffer_init() and __populate_section_memmap() for SPARSEMEM. The memory allocated in the FLATMEM case is zeroed and it is never poisoned, regardless of CONFIG_PAGE_POISON setting. The memory allocated in the SPARSEMEM cases is not zeroed and it is implicitly poisoned inside memblock if CONFIG_PAGE_POISON is set. Introduce memmap_alloc() wrapper for memblock allocators that will be used for both FLATMEM and SPARSEMEM cases and will makei memory map zeroing and poisoning consistent for different memory models. Link: https://lkml.kernel.org/r/20210714123739.16493-4-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 4 ++++ mm/page_alloc.c | 24 ++++++++++++++++++++++-- mm/sparse.c | 6 ++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 31ff935b2547..57e28261a3b1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd2efd6287ce..71ad97c96075 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6748,6 +6748,26 @@ static void __init memmap_init(void) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU @@ -7519,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); - map = memblock_alloc_node(size, SMP_CACHE_BYTES, - pgdat->node_id); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); if (!map) panic("Failed to allocate %ld bytes for node %d memory map\n", size, pgdat->node_id); diff --git a/mm/sparse.c b/mm/sparse.c index fd29b3abffa0..120bc8ea5293 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -436,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, size, addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + map = memmap_alloc(size, size, addr, nid, false); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", __func__, size, PAGE_SIZE, nid, &addr); @@ -464,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), - addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; } From 08678804e0b305bbbf5b756ad365373e5fe885a2 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:05 -0700 Subject: [PATCH 415/474] memblock: stop poisoning raw allocations Functions memblock_alloc_exact_nid_raw() and memblock_alloc_try_nid_raw() are intended for early memory allocation without overhead of zeroing the allocated memory. Since these functions were used to allocate the memory map, they have ended up with addition of a call to page_init_poison() that poisoned the allocated memory when CONFIG_PAGE_POISON was set. Since the memory map is allocated using a dedicated memmep_alloc() function that takes care of the poisoning, remove page poisoning from the memblock_alloc_*_raw() functions. Link: https://lkml.kernel.org/r/20210714123739.16493-5-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index de7b553baa50..a69449bffc8d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1491,18 +1491,12 @@ void * __init memblock_alloc_exact_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, true); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + true); } /** @@ -1529,18 +1523,12 @@ void * __init memblock_alloc_try_nid_raw( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { - void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); - ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid, false); - if (ptr && size > 0) - page_init_poison(ptr, size); - - return ptr; + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false); } /** From b346075fcf5dda7f9e9ae671703aae60e8a94564 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 2 Sep 2021 14:58:08 -0700 Subject: [PATCH 416/474] mm/page_alloc.c: fix 'zone_id' may be used uninitialized in this function warning When compiling with -Werror, cc1 will warn that 'zone_id' may be used uninitialized in this function warning. Initialize the zone_id as 0. Its safe to assume that if the code reaches this point it has at least one numa node with memory, so no need for an assertion before init_unavilable_range. Link: https://lkml.kernel.org/r/20210716210336.1114114-1-npache@redhat.com Fixes: 122e093c1734 ("mm/page_alloc: fix memory map initialization for descending nodes") Signed-off-by: Nico Pache Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 71ad97c96075..04021e37120e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6715,7 +6715,7 @@ static void __init memmap_init(void) { unsigned long start_pfn, end_pfn; unsigned long hole_pfn = 0; - int i, j, zone_id, nid; + int i, j, zone_id = 0, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { struct pglist_data *node = NODE_DATA(nid); From 3b446da6be7a722d769e23f68dbaf4ebb2eda542 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 14:58:10 -0700 Subject: [PATCH 417/474] mm/page_alloc: make alloc_node_mem_map() __init rather than __ref alloc_node_mem_map() is never only called from free_area_init_node() that is an __init function. Make the actual alloc_node_mem_map() also __init and its stub version static inline. Link: https://lkml.kernel.org/r/20210716064124.31865-1-rppt@kernel.org Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04021e37120e..05079b4ae7bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7515,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } #ifdef CONFIG_FLATMEM -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; @@ -7561,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif } #else -static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT From 88dc6f208829cfdbc0b96495c5c73a6af0559300 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 14:58:13 -0700 Subject: [PATCH 418/474] mm/page_alloc.c: use in_task() Obsoleted in_intrrupt() include task context with disabled BH, it's better to use in_task() instead. Link: https://lkml.kernel.org/r/877caa99-1994-5545-92d2-d0bb2e394182@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05079b4ae7bc..eaa936efad7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; show_mem(filter, nodemask); @@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * comment for __cpuset_node_allowed(). */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(current)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && in_task()) alloc_flags |= ALLOC_HARDER; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. */ - if (!in_interrupt() && !ac->nodemask) + if (in_task() && !ac->nodemask) ac->nodemask = &cpuset_current_mems_allowed; else *alloc_flags |= ALLOC_CPUSET; From 1d09510bcc6bc00ed406f0d65e39ab3b734f124b Mon Sep 17 00:00:00 2001 From: "George G. Davis" Date: Thu, 2 Sep 2021 14:58:16 -0700 Subject: [PATCH 419/474] mm/page_isolation: tracing: trace all test_pages_isolated failures Some test_pages_isolated failure conditions don't include trace points. For debugging issues caused by "pinned" pages, make sure to trace all calls whether they succeed or fail. In this case, a failure case did not result in a trace point. So add the missing failure case in test_pages_isolated traces. Link: https://lkml.kernel.org/r/20210823202823.13765-1-george_davis@mentor.com Signed-off-by: George G. Davis Cc: Eugeniu Rosca Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index bddf788f45bf..fff55bb830f9 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -287,6 +287,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn, flags; struct page *page; struct zone *zone; + int ret; /* * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages @@ -299,15 +300,21 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); - if ((pfn < end_pfn) || !page) - return -EBUSY; + if ((pfn < end_pfn) || !page) { + ret = -EBUSY; + goto out; + } + /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); spin_unlock_irqrestore(&zone->lock, flags); + ret = pfn < end_pfn ? -EBUSY : 0; + +out: trace_test_pages_isolated(start_pfn, end_pfn, pfn); - return pfn < end_pfn ? -EBUSY : 0; + return ret; } From ae611d072c5c2968e2cc29431cf58094d8971b94 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:58:19 -0700 Subject: [PATCH 420/474] mm/hwpoison: remove unneeded variable unmap_success Patch series "Cleanups and fixup for hwpoison" This series contains cleanups to remove unneeded variable, fix some obsolete comments and so on. Also we fix potential pte_unmap_unlock on wrong pte. More details can be found in the respective changelogs. This patch (of 4): unmap_success is used to indicate whether page is successfully unmapped but it's irrelated with ZONE_DEVICE page and unmap_success is always true here. Remove this unneeded one. Link: https://lkml.kernel.org/r/20210814105131.48814-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210814105131.48814-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 470400cc7513..9793c78c3777 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1518,7 +1518,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { struct page *page = pfn_to_page(pfn); - const bool unmap_success = true; unsigned long size = 0; struct to_kill *tk; LIST_HEAD(tokill); @@ -1590,7 +1589,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, start = (page->index << PAGE_SHIFT) & ~(size - 1); unmap_mapping_range(page->mapping, start, size, 0); } - kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags); + kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags); rc = 0; unlock: dax_unlock_page(page, cookie); From ea3732f7a1cf636284388988d1a1e56d5cba6044 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:58:22 -0700 Subject: [PATCH 421/474] mm/hwpoison: fix potential pte_unmap_unlock pte error If the first pte is equal to poisoned_pfn, i.e. check_hwpoisoned_entry() return 1, the wrong ptep - 1 would be passed to pte_unmap_unlock(). Link: https://lkml.kernel.org/r/20210814105131.48814-3-linmiaohe@huawei.com Fixes: ad9c59c24095 ("mm,hwpoison: send SIGBUS with error virutal address") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9793c78c3777..224bd0be223c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -632,7 +632,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, { struct hwp_walk *hwp = (struct hwp_walk *)walk->private; int ret = 0; - pte_t *ptep; + pte_t *ptep, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmdp, walk->vma); @@ -645,14 +645,15 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, if (pmd_trans_unstable(pmdp)) goto out; - ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, + addr, &ptl); for (; addr != end; ptep++, addr += PAGE_SIZE) { ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, hwp->pfn, &hwp->tk); if (ret == 1) break; } - pte_unmap_unlock(ptep - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); out: cond_resched(); return ret; From ed8c2f492d4e7248a9c0493c444c47bed84d345d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:58:25 -0700 Subject: [PATCH 422/474] mm/hwpoison: change argument struct page **hpagep to *hpage It's unnecessary to pass in a struct page **hpagep because it's never modified. Changing to use *hpage to simplify the code. Link: https://lkml.kernel.org/r/20210814105131.48814-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 224bd0be223c..102caf78aae8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1271,14 +1271,13 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * the pages and send SIGBUS to the processes if the data was dirty. */ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page **hpagep) + int flags, struct page *hpage) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int kill = 1, forcekill; - struct page *hpage = *hpagep; bool mlocked = PageMlocked(hpage); /* @@ -1503,7 +1502,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - if (!hwpoison_user_mappings(p, pfn, flags, &head)) { + if (!hwpoison_user_mappings(p, pfn, flags, head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; @@ -1783,7 +1782,7 @@ try_again: * Now take care of user space mappings. * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ - if (!hwpoison_user_mappings(p, pfn, flags, &p)) { + if (!hwpoison_user_mappings(p, pfn, flags, p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto unlock_page; From a21c184fe25eab36fb6efabae55333452171d53b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:58:28 -0700 Subject: [PATCH 423/474] mm/hwpoison: fix some obsolete comments Since commit cb731d6c62bb ("vmscan: per memory cgroup slab shrinkers"), shrink_node_slabs is renamed to drop_slab_node. And doit argument is changed to forcekill since commit 6751ed65dc66 ("x86/mce: Fix siginfo_t->si_addr value for non-recoverable memory faults"). Link: https://lkml.kernel.org/r/20210814105131.48814-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 102caf78aae8..f83a2af0af18 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -296,7 +296,7 @@ void shake_page(struct page *p, int access) } /* - * Only call shrink_node_slabs here (which would also shrink + * Only call drop_slab_node here (which would also shrink * other caches) if access is not potentially fatal. */ if (access) @@ -391,8 +391,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, /* * Kill the processes that have been collected earlier. * - * Only do anything when DOIT is set, otherwise just free the list - * (this is used for clean pages which do not need killing) + * Only do anything when FORCEKILL is set, otherwise just free the + * list (this is used for clean pages which do not need killing) * Also when FAIL is set do a force kill because something went * wrong earlier. */ From d0505e9f7dcec85da6634ec66da2b17656ee177b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:31 -0700 Subject: [PATCH 424/474] mm: hwpoison: don't drop slab caches for offlining non-LRU page In the current implementation of soft offline, if non-LRU page is met, all the slab caches will be dropped to free the page then offline. But if the page is not slab page all the effort is wasted in vain. Even though it is a slab page, it is not guaranteed the page could be freed at all. However the side effect and cost is quite high. It does not only drop the slab caches, but also may drop a significant amount of page caches which are associated with inode caches. It could make the most workingset gone in order to just offline a page. And the offline is not guaranteed to succeed at all, actually I really doubt the success rate for real life workload. Furthermore the worse consequence is the system may be locked up and unusable since the page cache release may incur huge amount of works queued for memcg release. Actually we ran into such unpleasant case in our production environment. Firstly, the workqueue of memory_failure_work_func is locked up as below: BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 53s! Showing busy workqueues and worker pools: workqueue events: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=14/256 refcnt=15 in-flight: 409271:memory_failure_work_func pending: kfree_rcu_work, kfree_rcu_monitor, kfree_rcu_work, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, rht_deferred_worker, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, kfree_rcu_work, drain_local_stock, kfree_rcu_work workqueue mm_percpu_wq: flags=0x8 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2 pending: vmstat_update workqueue cgroup_destroy: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 refcnt=12072 pending: css_release_work_fn There were over 12K css_release_work_fn queued, and this caused a few lockups due to the contention of worker pool lock with IRQ disabled, for example: NMI watchdog: Watchdog detected hard LOCKUP on cpu 1 Modules linked in: amd64_edac_mod edac_mce_amd crct10dif_pclmul crc32_pclmul ghash_clmulni_intel xt_DSCP iptable_mangle kvm_amd bpfilter vfat fat acpi_ipmi i2c_piix4 usb_storage ipmi_si k10temp i2c_core ipmi_devintf ipmi_msghandler acpi_cpufreq sch_fq_codel xfs libcrc32c crc32c_intel mlx5_core mlxfw nvme xhci_pci ptp nvme_core pps_core xhci_hcd CPU: 1 PID: 205500 Comm: kworker/1:0 Tainted: G L 5.10.32-t1.el7.twitter.x86_64 #1 Hardware name: TYAN F5AMT /z /S8026GM2NRE-CGN, BIOS V8.030 03/30/2021 Workqueue: events memory_failure_work_func RIP: 0010:queued_spin_lock_slowpath+0x41/0x1a0 Code: 41 f0 0f ba 2f 08 0f 92 c0 0f b6 c0 c1 e0 08 89 c2 8b 07 30 e4 09 d0 a9 00 01 ff ff 75 1b 85 c0 74 0e 8b 07 84 c0 74 08 f3 90 <8b> 07 84 c0 75 f8 b8 01 00 00 00 66 89 07 c3 f6 c4 01 75 04 c6 47 RSP: 0018:ffff9b2ac278f900 EFLAGS: 00000002 RAX: 0000000000480101 RBX: ffff8ce98ce71800 RCX: 0000000000000084 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8ce98ce6a140 RBP: 00000000000284c8 R08: ffffd7248dcb6808 R09: 0000000000000000 R10: 0000000000000003 R11: ffff9b2ac278f9b0 R12: 0000000000000001 R13: ffff8cb44dab9c00 R14: ffffffffbd1ce6a0 R15: ffff8cacaa37f068 FS: 0000000000000000(0000) GS:ffff8ce98ce40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcf6e8cb000 CR3: 0000000a0c60a000 CR4: 0000000000350ee0 Call Trace: __queue_work+0xd6/0x3c0 queue_work_on+0x1c/0x30 uncharge_batch+0x10e/0x110 mem_cgroup_uncharge_list+0x6d/0x80 release_pages+0x37f/0x3f0 __pagevec_release+0x1c/0x50 __invalidate_mapping_pages+0x348/0x380 inode_lru_isolate+0x10a/0x160 __list_lru_walk_one+0x7b/0x170 list_lru_walk_one+0x4a/0x60 prune_icache_sb+0x37/0x50 super_cache_scan+0x123/0x1a0 do_shrink_slab+0x10c/0x2c0 shrink_slab+0x1f1/0x290 drop_slab_node+0x4d/0x70 soft_offline_page+0x1ac/0x5b0 memory_failure_work_func+0x6a/0x90 process_one_work+0x19e/0x340 worker_thread+0x30/0x360 kthread+0x116/0x130 The lockup made the machine is quite unusable. And it also made the most workingset gone, the reclaimabled slab caches were reduced from 12G to 300MB, the page caches were decreased from 17G to 4G. But the most disappointing thing is all the effort doesn't make the page offline, it just returns: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It seems the aggressive behavior for non-LRU page didn't pay back, so it doesn't make too much sense to keep it considering the terrible side effect. Link: https://lkml.kernel.org/r/20210819054116.266126-1-shy828301@gmail.com Signed-off-by: Yang Shi Reported-by: David Mackey Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/hwpoison-inject.c | 2 +- mm/memory-failure.c | 18 ++++++++---------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 35bbac32b6f6..11c38550627c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3110,7 +3110,7 @@ extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); +extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 1ae1ebc2b9b1..aff4d27ec235 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - shake_page(hpage, 0); + shake_page(hpage); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f83a2af0af18..5decacb86b9f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) /* * Unknown page type encountered. Try to check whether it can turn PageLRU by - * lru_add_drain_all, or a free page by reclaiming slabs when possible. + * lru_add_drain_all. */ -void shake_page(struct page *p, int access) +void shake_page(struct page *p) { if (PageHuge(p)) return; @@ -296,11 +296,9 @@ void shake_page(struct page *p, int access) } /* - * Only call drop_slab_node here (which would also shrink - * other caches) if access is not potentially fatal. + * TODO: Could shrink slab caches here if a lightweight range-based + * shrinker will be available. */ - if (access) - drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -1205,7 +1203,7 @@ try_again: * page, retry. */ if (pass++ < 3) { - shake_page(p, 1); + shake_page(p); goto try_again; } ret = -EIO; @@ -1222,7 +1220,7 @@ try_again: */ if (pass++ < 3) { put_page(p); - shake_page(p, 1); + shake_page(p); count_increased = false; goto try_again; } @@ -1369,7 +1367,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage, 0); + shake_page(hpage); /* * Now that the dirty bit has been propagated to the @@ -1723,7 +1721,7 @@ try_again: * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p, 0); + shake_page(p); lock_page(p); From f6533121696b2126a33b436f433a048b4427944f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:34 -0700 Subject: [PATCH 425/474] doc: hwpoison: correct the support for hugepage The hwpoison support for huge page, both hugetlb and THP, has been in kernel for a while, the statement in document is obsolete, correct it. Link: https://lkml.kernel.org/r/20210819054116.266126-2-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Naoya Horiguchi Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: David Mackey Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hwpoison.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst index a5c884293dac..89b5f7a52077 100644 --- a/Documentation/vm/hwpoison.rst +++ b/Documentation/vm/hwpoison.rst @@ -180,7 +180,6 @@ Limitations =========== - Not all page types are supported and never will. Most kernel internal objects cannot be recovered, only LRU pages for now. -- Right now hugepage support is missing. --- Andi Kleen, Oct 2009 From 941ca063eb8ed01e66336b1f493e95b107024bc8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:58:37 -0700 Subject: [PATCH 426/474] mm: hwpoison: dump page for unhandlable page Currently just very simple message is shown for unhandlable page, e.g. non-LRU page, like: soft_offline: 0x1469f2: unknown non LRU page type 5ffff0000000000 () It is not very helpful for further debug, calling dump_page() could show more useful information. Calling dump_page() in get_any_page() in order to not duplicate the call in a couple of different places. It may be called with pcp disabled and holding memory hotplug lock, it should be not a big deal since hwpoison handler is not called very often. [shy828301@gmail.com: remove redundant pr_info per Noaya Horiguchi] Link: https://lkml.kernel.org/r/20210824020946.195257-3-shy828301@gmail.com Link: https://lkml.kernel.org/r/20210819054116.266126-3-shy828301@gmail.com Signed-off-by: Yang Shi Suggested-by: Matthew Wilcox Acked-by: Naoya Horiguchi Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: David Mackey Cc: Jonathan Corbet Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5decacb86b9f..b3e8c75bee66 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1228,6 +1228,9 @@ try_again: ret = -EIO; } out: + if (ret == -EIO) + dump_page(p, "hwpoison: unhandlable page"); + return ret; } @@ -2205,9 +2208,6 @@ retry: try_again = false; goto retry; } - } else if (ret == -EIO) { - pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", - __func__, pfn, page->flags, &page->flags); } return ret; From f87060d345232c7d855167a43faf006e24afa999 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 2 Sep 2021 14:58:40 -0700 Subject: [PATCH 427/474] mm: fix panic caused by __page_handle_poison() In commit 510d25c92ec4 ("mm/hwpoison: disable pcp for page_handle_poison()"), __page_handle_poison() was introduced, and if we mark: RET_A = dissolve_free_huge_page(); RET_B = take_page_off_buddy(); then __page_handle_poison was supposed to return TRUE When RET_A == 0 && RET_B == TRUE But since it failed to take care the case when RET_A is -EBUSY or -ENOMEM, and just return the ret as a bool which actually become TRUE, it break the original logic. The following result is a huge page in freelist but was referenced as poisoned, and lead into the final panic: kernel BUG at mm/internal.h:95! invalid opcode: 0000 [#1] SMP PTI skip... RIP: 0010:set_page_refcounted mm/internal.h:95 [inline] RIP: 0010:remove_hugetlb_page+0x23c/0x240 mm/hugetlb.c:1371 skip... Call Trace: remove_pool_huge_page+0xe4/0x110 mm/hugetlb.c:1892 return_unused_surplus_pages+0x8d/0x150 mm/hugetlb.c:2272 hugetlb_acct_memory.part.91+0x524/0x690 mm/hugetlb.c:4017 This patch replaces 'bool' with 'int' to handle RET_A correctly. Link: https://lkml.kernel.org/r/61782ac6-1e8a-4f6f-35e6-e94fce3b37f5@linux.alibaba.com Fixes: 510d25c92ec4 ("mm/hwpoison: disable pcp for page_handle_poison()") Signed-off-by: Michael Wang Acked-by: Naoya Horiguchi Reported-by: Abaci Cc: [5.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b3e8c75bee66..2f925615e573 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -68,7 +68,7 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool __page_handle_poison(struct page *page) { - bool ret; + int ret; zone_pcp_disable(page_zone(page)); ret = dissolve_free_huge_page(page); @@ -76,7 +76,7 @@ static bool __page_handle_poison(struct page *page) ret = take_page_off_buddy(page); zone_pcp_enable(page_zone(page)); - return ret; + return ret > 0; } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) From 416d85ed3e08c1164c1405249a94343166837802 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:43 -0700 Subject: [PATCH 428/474] hugetlb: simplify prep_compound_gigantic_page ref count racing code Code in prep_compound_gigantic_page waits for a rcu grace period if it notices a temporarily inflated ref count on a tail page. This was due to the identified potential race with speculative page cache references which could only last for a rcu grace period. This is overly complicated as this situation is VERY unlikely to ever happen. Instead, just quickly return an error. Also, only print a warning in prep_compound_gigantic_page instead of multiple callers. Link: https://lkml.kernel.org/r/20210809184832.18342-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8ea35ba6699f..3658398c835d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1657,16 +1657,14 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order) * cache adding could take a ref on a 'to be' tail page. * We need to respect any increased ref count, and only set * the ref count to zero if count is currently 1. If count - * is not 1, we call synchronize_rcu in the hope that a rcu - * grace period will cause ref count to drop and then retry. - * If count is still inflated on retry we return an error and - * must discard the pages. + * is not 1, we return an error. An error return indicates + * the set of pages can not be converted to a gigantic page. + * The caller who allocated the pages should then discard the + * pages using the appropriate free interface. */ if (!page_ref_freeze(p, 1)) { - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - synchronize_rcu(); - if (!page_ref_freeze(p, 1)) - goto out_error; + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; } set_page_count(p, 0); set_compound_head(p, page); @@ -1830,7 +1828,6 @@ retry: retry = true; goto retry; } - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); return NULL; } } @@ -2828,8 +2825,8 @@ static void __init gather_bootmem_prealloc(void) prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* add to the hugepage allocator */ } else { + /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); } /* From b65a4edae11ecd209a0f7c39e856de24678612d9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:47 -0700 Subject: [PATCH 429/474] hugetlb: drop ref count earlier after page allocation When discussing the possibility of inflated page ref counts, Muuchun Song pointed out this potential issue [1]. It is true that any code could potentially take a reference on a compound page after allocation and before it is converted to and put into use as a hugetlb page. Specifically, this could be done by any users of get_page_unless_zero. There are three areas of concern within hugetlb code. 1) When adding pages to the pool. In this case, new pages are allocated added to the pool by calling put_page to invoke the hugetlb destructor (free_huge_page). If there is an inflated ref count on the page, it will not be immediately added to the free list. It will only be added to the free list when the temporary ref count is dropped. This is deemed acceptable and will not be addressed. 2) A page is allocated for immediate use normally as a surplus page or migration target. In this case, the user of the page will also hold a reference. There is no issue as this is just like normal page ref counting. 3) A page is allocated and MUST be added to the free list to satisfy a reservation. One such example is gather_surplus_pages as pointed out by Muchun in [1]. More specifically, this case covers callers of enqueue_huge_page where the page reference count must be zero. This patch covers this third case. Three routines call enqueue_huge_page when the page reference count could potentially be inflated. They are: gather_surplus_pages, alloc_and_dissolve_huge_page and add_hugetlb_page. add_hugetlb_page is called on error paths when a huge page can not be freed due to the inability to allocate vmemmap pages. In this case, the temporairly inflated ref count is not an issue. When the ref is dropped the appropriate action will be taken. Instead of VM_BUG_ON if the ref count does not drop to zero, simply return. In gather_surplus_pages and alloc_and_dissolve_huge_page the caller expects a page (or pages) to be put on the free lists. In this case we must ensure there are no temporary ref counts. We do this by calling put_page_testzero() earlier and not using pages without a zero ref count. The temporary page flag (HPageTemporary) is used in such cases so that as soon as the inflated ref count is dropped the page will be freed. [1] https://lore.kernel.org/linux-mm/CAMZfGtVMn3daKrJwZMaVOGOaJU+B4dS--x_oPmGQMD=c=QNGEg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210809184832.18342-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 100 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 22 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3658398c835d..8f97321396cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) int nid = page_to_nid(page); lockdep_assert_held(&hugetlb_lock); + VM_BUG_ON_PAGE(page_count(page), page); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1399,11 +1401,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, SetHPageVmemmapOptimized(page); /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the last reference. + * This page is about to be managed by the hugetlb allocator and + * should have no users. Drop our reference, and check for others + * just in case. */ zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + if (!zeroed) + /* + * It is VERY unlikely soneone else has taken a ref on + * the page. In this case, we simply return as the + * hugetlb destructor (free_huge_page) will be called + * when this other ref is dropped. + */ + return; + arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); } @@ -2017,9 +2028,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) + int nid, nodemask_t *nmask, bool zero_ref) { struct page *page = NULL; + bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2029,6 +2041,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); +retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2046,11 +2059,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; - } else { - h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; } + if (zero_ref) { + /* + * Caller requires a page with zero ref count. + * We will drop ref count here. If someone else is holding + * a ref, the page will be freed when they drop it. Abuse + * temporary page flag to accomplish this. + */ + SetHPageTemporary(page); + if (!put_page_testzero(page)) { + /* + * Unexpected inflated ref count on freshly allocated + * huge. Retry once. + */ + pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); + spin_unlock_irq(&hugetlb_lock); + if (retry) + return NULL; + + retry = true; + goto retry; + } + ClearHPageTemporary(page); + } + + h->surplus_huge_pages++; + h->surplus_huge_pages_node[page_to_nid(page)]++; + out_unlock: spin_unlock_irq(&hugetlb_lock); @@ -2092,7 +2129,7 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); mpol_cond_put(mpol); return page; @@ -2164,7 +2201,7 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + NUMA_NO_NODE, NULL, true); if (!page) { alloc_ok = false; break; @@ -2205,24 +2242,20 @@ retry: /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - int zeroed; - if ((--needed) < 0) break; - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - zeroed = put_page_testzero(page); - VM_BUG_ON_PAGE(!zeroed, page); + /* Add the page to the hugetlb allocator */ enqueue_huge_page(h, page); } free: spin_unlock_irq(&hugetlb_lock); - /* Free unnecessary surplus pages to the buddy allocator */ + /* + * Free unnecessary surplus pages to the buddy allocator. + * Pages have no ref count, call free_huge_page directly. + */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) - put_page(page); + free_huge_page(page); spin_lock_irq(&hugetlb_lock); return ret; @@ -2531,6 +2564,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); + bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2541,9 +2575,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. */ +alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; + /* + * If all goes well, this page will be directly added to the free + * list in the pool. For this the ref count needs to be zero. + * Attempt to drop now, and retry once if needed. It is VERY + * unlikely there is another ref on the page. + * + * If someone else has a reference to the page, it will be freed + * when they drop their ref. Abuse temporary page flag to accomplish + * this. Retry once if there is an inflated ref count. + */ + SetHPageTemporary(new_page); + if (!put_page_testzero(new_page)) { + if (alloc_retry) + return -EBUSY; + + alloc_retry = true; + goto alloc_retry; + } + ClearHPageTemporary(new_page); + __prep_new_huge_page(h, new_page); retry: @@ -2583,11 +2638,10 @@ retry: remove_hugetlb_page(h, old_page, false); /* - * Reference count trick is needed because allocator gives us - * referenced page but the pool requires pages with 0 refcount. + * Ref count on new page is already zero as it was dropped + * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - page_ref_dec(new_page); enqueue_huge_page(h, new_page); /* @@ -2601,6 +2655,8 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); + /* Page has a zero ref count, but needs a ref to be freed */ + set_page_refcounted(new_page); update_and_free_page(h, new_page, false); return ret; From e32d20c0c88b1cd0a44f882c4f0eb2f536363d1b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:50 -0700 Subject: [PATCH 430/474] hugetlb: before freeing hugetlb page set dtor to appropriate value When removing a hugetlb page from the pool the ref count is set to one (as the free page has no ref count) and compound page destructor is set to NULL_COMPOUND_DTOR. Since a subsequent call to free the hugetlb page will call __free_pages for non-gigantic pages and free_gigantic_page for gigantic pages the destructor is not used. However, consider the following race with code taking a speculative reference on the page: Thread 0 Thread 1 -------- -------- remove_hugetlb_page set_page_refcounted(page); set_compound_page_dtor(page, NULL_COMPOUND_DTOR); get_page_unless_zero(page) __update_and_free_page __free_pages(page, huge_page_order(h)); /* Note that __free_pages() will simply drop the reference to the page. */ put_page(page) __put_compound_page() destroy_compound_page NULL_COMPOUND_DTOR BUG: kernel NULL pointer dereference, address: 0000000000000000 To address this race, set the dtor to the normal compound page dtor for non-gigantic pages. The dtor for gigantic pages does not matter as gigantic pages are changed from a compound page to 'just a group of pages' before freeing. Hence, the destructor is not used. Link: https://lkml.kernel.org/r/20210809184832.18342-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Oscar Salvador Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Mina Almasry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8f97321396cc..dd1c1e7d970b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1370,8 +1370,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]--; } + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. Before freeing the gigantic + * page destroy_compound_gigantic_page will turn the compound page + * into a simple group of pages. After this the destructor does not + * apply. + * + * This handles the case where more than one ref is held when and + * after update_and_free_page is called. + */ set_page_refcounted(page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (hstate_is_gigantic(h)) + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + else + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; From 09a26e832705fdb7a9484495b71a05e0bbc65207 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 2 Sep 2021 14:58:53 -0700 Subject: [PATCH 431/474] hugetlb: fix hugetlb cgroup refcounting during vma split Guillaume Morin reported hitting the following WARNING followed by GPF or NULL pointer deference either in cgroups_destroy or in the kill_css path.: percpu ref (css_release) <= 0 (-1) after switching to atomic WARNING: CPU: 23 PID: 130 at lib/percpu-refcount.c:196 percpu_ref_switch_to_atomic_rcu+0x127/0x130 CPU: 23 PID: 130 Comm: ksoftirqd/23 Kdump: loaded Tainted: G O 5.10.60 #1 RIP: 0010:percpu_ref_switch_to_atomic_rcu+0x127/0x130 Call Trace: rcu_core+0x30f/0x530 rcu_core_si+0xe/0x10 __do_softirq+0x103/0x2a2 run_ksoftirqd+0x2b/0x40 smpboot_thread_fn+0x11a/0x170 kthread+0x10a/0x140 ret_from_fork+0x22/0x30 Upon further examination, it was discovered that the css structure was associated with hugetlb reservations. For private hugetlb mappings the vma points to a reserve map that contains a pointer to the css. At mmap time, reservations are set up and a reference to the css is taken. This reference is dropped in the vma close operation; hugetlb_vm_op_close. However, if a vma is split no additional reference to the css is taken yet hugetlb_vm_op_close will be called twice for the split vma resulting in an underflow. Fix by taking another reference in hugetlb_vm_op_open. Note that the reference is only taken for the owner of the reserve map. In the more common fork case, the pointer to the reserve map is cleared for non-owning vmas. Link: https://lkml.kernel.org/r/20210830215015.155224-1-mike.kravetz@oracle.com Fixes: e9fe92ae0cd2 ("hugetlb_cgroup: add reservation accounting for private mappings") Signed-off-by: Mike Kravetz Reported-by: Guillaume Morin Suggested-by: Guillaume Morin Tested-by: Guillaume Morin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ mm/hugetlb.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0b8d1fdda3a1..c137396129db 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) css_put(&h_cg->css); } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_get(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } +static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd1c1e7d970b..41a1778d3f67 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4106,8 +4106,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) From a759a909d42d727e918bd5248d6cff7562fa8109 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:56 -0700 Subject: [PATCH 432/474] userfaultfd: change mmap_changing to atomic Patch series "userfaultfd: minor bug fixes". Three unrelated bug fixes. The first two addresses possible issues (not too theoretical ones), but I did not encounter them in practice. The third patch addresses a test bug that causes the test to fail on my system. It has been sent before as part of a bigger RFC. This patch (of 3): mmap_changing is currently a boolean variable, which is set and cleared without any lock that protects against concurrent modifications. mmap_changing is supposed to mark whether userfaultfd page-faults handling should be retried since mappings are undergoing a change. However, concurrent calls, for instance to madvise(MADV_DONTNEED), might cause mmap_changing to be false, although the remove event was still not read (hence acknowledged) by the user. Change mmap_changing to atomic_t and increase/decrease appropriately. Add a debug assertion to see whether mmap_changing is negative. Link: https://lkml.kernel.org/r/20210808020724.1022515-1-namit@vmware.com Link: https://lkml.kernel.org/r/20210808020724.1022515-2-namit@vmware.com Fixes: df2cc96e77011 ("userfaultfd: prevent non-cooperative events vs mcopy_atomic races") Signed-off-by: Nadav Amit Cc: Mike Rapoport Cc: Peter Xu Cc: Axel Rasmussen Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 25 +++++++++++++------------ include/linux/userfaultfd_k.h | 8 ++++---- mm/userfaultfd.c | 15 ++++++++------- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5c2d806e6ae5..29a3016f16c9 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -74,7 +74,7 @@ struct userfaultfd_ctx { /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ - bool mmap_changing; + atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -623,7 +623,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: - WRITE_ONCE(ctx->mmap_changing, false); + atomic_dec(&ctx->mmap_changing); + VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -669,12 +670,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); - WRITE_ONCE(octx->mmap_changing, true); + atomic_inc(&octx->mmap_changing); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1700,7 +1701,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1757,7 +1758,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1807,7 +1808,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; @@ -1855,7 +1856,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -2087,7 +2088,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bcc..33cea484d1ad 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode); + atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, - bool *mmap_changing); + atomic_t *mmap_changing); extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, bool *mmap_changing); + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, - bool enable_wp, bool *mmap_changing); + bool enable_wp, atomic_t *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0e2132834bc7..7a9008415534 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mcopy_mode, - bool *mmap_changing, + atomic_t *mmap_changing, __u64 mode) { struct vm_area_struct *dst_vma; @@ -517,7 +517,7 @@ retry: * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; /* @@ -650,28 +650,29 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing, __u64 mode) + atomic_t *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, mmap_changing, 0); } ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool *mmap_changing) + unsigned long len, atomic_t *mmap_changing) { return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, bool *mmap_changing) + unsigned long len, bool enable_wp, + atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; pgprot_t newprot; @@ -694,7 +695,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * request the user to retry later */ err = -EAGAIN; - if (mmap_changing && READ_ONCE(*mmap_changing)) + if (mmap_changing && atomic_read(mmap_changing)) goto out_unlock; err = -ENOENT; From 22e5fe2a2a279d9a6fcbdfb4dffe73821bef1c90 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:58:59 -0700 Subject: [PATCH 433/474] userfaultfd: prevent concurrent API initialization userfaultfd assumes that the enabled features are set once and never changed after UFFDIO_API ioctl succeeded. However, currently, UFFDIO_API can be called concurrently from two different threads, succeed on both threads and leave userfaultfd's features in non-deterministic state. Theoretically, other uffd operations (ioctl's and page-faults) can be dispatched while adversely affected by such changes of features. Moreover, the writes to ctx->state and ctx->features are not ordered, which can - theoretically, again - let userfaultfd_ioctl() think that userfaultfd API completed, while the features are still not initialized. To avoid races, it is arguably best to get rid of ctx->state. Since there are only 2 states, record the API initialization in ctx->features as the uppermost bit and remove ctx->state. Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor") Signed-off-by: Nadav Amit Cc: Alexander Viro Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jens Axboe Cc: Mike Rapoport Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 91 +++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 29a3016f16c9..003f0d31743e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -enum userfaultfd_state { - UFFD_STATE_WAIT_API, - UFFD_STATE_RUNNING, -}; - /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. @@ -69,8 +64,6 @@ struct userfaultfd_ctx { unsigned int flags; /* features requested from the userspace */ unsigned int features; - /* state machine */ - enum userfaultfd_state state; /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ @@ -104,6 +97,14 @@ struct userfaultfd_wake_range { unsigned long len; }; +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -667,7 +668,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; - ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; atomic_set(&ctx->mmap_changing, 0); @@ -944,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->fd_wqh, wait); - switch (ctx->state) { - case UFFD_STATE_WAIT_API: + if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; - case UFFD_STATE_RUNNING: - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). - */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - return ret; - default: - WARN_ON_ONCE(1); + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; - } + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; } static const struct file_operations userfaultfd_fops; @@ -1170,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, int no_wait = file->f_flags & O_NONBLOCK; struct inode *inode = file_inode(file); - if (ctx->state == UFFD_STATE_WAIT_API) + if (!userfaultfd_is_initialized(ctx)) return -EINVAL; for (;;) { @@ -1909,9 +1904,10 @@ out: static inline unsigned int uffd_ctx_features(__u64 user_features) { /* - * For the current set of features the bits just coincide + * For the current set of features the bits just coincide. Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ - return (unsigned int)user_features; + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } /* @@ -1924,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; + unsigned int ctx_features; int ret; __u64 features; - ret = -EINVAL; - if (ctx->state != UFFD_STATE_WAIT_API) - goto out; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; @@ -1953,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; - ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ - ctx->features = uffd_ctx_features(features); + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + ret = 0; out: return ret; @@ -1972,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; - if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL; switch(cmd) { @@ -2086,7 +2084,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; - ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; From 4410cbb5c9f9371556c4e928b5dd00226b073082 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 2 Sep 2021 14:59:02 -0700 Subject: [PATCH 434/474] selftests/vm/userfaultfd: wake after copy failure When userfaultfd copy-ioctl fails since the PTE already exists, an -EEXIST error is returned and the faulting thread is not woken. The current userfaultfd test does not wake the faulting thread in such case. The assumption is presumably that another thread set the PTE through copy/wp ioctl and would wake the faulting thread or that alternatively the fault handler would realize there is no need to "must_wait" and continue. This is not necessarily true. There is an assumption that the "must_wait" tests in handle_userfault() are sufficient to provide definitive answer whether the offending PTE is populated or not. However, userfaultfd_must_wait() test is lockless. Consequently, concurrent calls to ptep_modify_prot_start(), for instance, can clear the PTE and can cause userfaultfd_must_wait() to wrongly assume it is not populated and a wait is needed. There are therefore 3 options: (1) Change the tests to wake on copy failure. (2) Wake faulting thread unconditionally on zero/copy ioctls before returning -EEXIST. (3) Change the userfaultfd_must_wait() to hold locks. This patch took the first approach, but the others are valid solutions with different tradeoffs. Link: https://lkml.kernel.org/r/20210808020724.1022515-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Jens Axboe Cc: Andrea Arcangeli Cc: Peter Xu Cc: Alexander Viro Cc: Axel Rasmussen Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2ea438e6b8b1..10ab56c2484a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -566,6 +566,18 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, } } +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + static int __copy_page(int ufd, unsigned long offset, bool retry) { struct uffdio_copy uffdio_copy; @@ -585,6 +597,7 @@ static int __copy_page(int ufd, unsigned long offset, bool retry) if (uffdio_copy.copy != -EEXIST) err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); } else if (uffdio_copy.copy != page_size) { err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { From 79c28a41672278283fa72e03d0bf80e6644d4ac4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:06 -0700 Subject: [PATCH 435/474] mm/numa: automatically generate node migration order Patch series "Migrate Pages in lieu of discard", v11. We're starting to see systems with more and more kinds of memory such as Intel's implementation of persistent memory. Let's say you have a system with some DRAM and some persistent memory. Today, once DRAM fills up, reclaim will start and some of the DRAM contents will be thrown out. Allocations will, at some point, start falling over to the slower persistent memory. That has two nasty properties. First, the newer allocations can end up in the slower persistent memory. Second, reclaimed data in DRAM are just discarded even if there are gobs of space in persistent memory that could be used. This patchset implements a solution to these problems. At the end of the reclaim process in shrink_page_list() just before the last page refcount is dropped, the page is migrated to persistent memory instead of being dropped. While I've talked about a DRAM/PMEM pairing, this approach would function in any environment where memory tiers exist. This is not perfect. It "strands" pages in slower memory and never brings them back to fast DRAM. Huang Ying has follow-on work which repurposes NUMA balancing to promote hot pages back to DRAM. This is also all based on an upstream mechanism that allows persistent memory to be onlined and used as if it were volatile: http://lkml.kernel.org/r/20190124231441.37A4A305@viggo.jf.intel.com With that, the DRAM and PMEM in each socket will be represented as 2 separate NUMA nodes, with the CPUs sit in the DRAM node. So the general inter-NUMA demotion mechanism introduced in the patchset can migrate the cold DRAM pages to the PMEM node. We have tested the patchset with the postgresql and pgbench. On a 2-socket server machine with DRAM and PMEM, the kernel with the patchset can improve the score of pgbench up to 22.1% compared with that of the DRAM only + disk case. This comes from the reduced disk read throughput (which reduces up to 70.8%). == Open Issues == * Memory policies and cpusets that, for instance, restrict allocations to DRAM can be demoted to PMEM whenever they opt in to this new mechanism. A cgroup-level API to opt-in or opt-out of these migrations will likely be required as a follow-on. * Could be more aggressive about where anon LRU scanning occurs since it no longer necessarily involves I/O. get_scan_count() for instance says: "If we have no swap space, do not bother scanning anon pages" This patch (of 9): Prepare for the kernel to auto-migrate pages to other memory nodes with a node migration table. This allows creating single migration target for each NUMA node to enable the kernel to do NUMA page migrations instead of simply discarding colder pages. A node with no target is a "terminal node", so reclaim acts normally there. The migration target does not fundamentally _need_ to be a single node, but this implementation starts there to limit complexity. When memory fills up on a node, memory contents can be automatically migrated to another node. The biggest problems are knowing when to migrate and to where the migration should be targeted. The most straightforward way to generate the "to where" list would be to follow the page allocator fallback lists. Those lists already tell us if memory is full where to look next. It would also be logical to move memory in that order. But, the allocator fallback lists have a fatal flaw: most nodes appear in all the lists. This would potentially lead to migration cycles (A->B, B->A, A->B, ...). Instead of using the allocator fallback lists directly, keep a separate node migration ordering. But, reuse the same data used to generate page allocator fallback in the first place: find_next_best_node(). This means that the firmware data used to populate node distances essentially dictates the ordering for now. It should also be architecture-neutral since all NUMA architectures have a working find_next_best_node(). RCU is used to allow lock-less read of node_demotion[] and prevent demotion cycles been observed. If multiple reads of node_demotion[] are performed, a single rcu_read_lock() must be held over all reads to ensure no cycles are observed. Details are as follows. === What does RCU provide? === Imagine a simple loop which walks down the demotion path looking for the last node: terminal_node = start_node; while (node_demotion[terminal_node] != NUMA_NO_NODE) { terminal_node = node_demotion[terminal_node]; } The initial values are: node_demotion[0] = 1; node_demotion[1] = NUMA_NO_NODE; and are updated to: node_demotion[0] = NUMA_NO_NODE; node_demotion[1] = 0; What guarantees that the cycle is not observed: node_demotion[0] = 1; node_demotion[1] = 0; and would loop forever? With RCU, a rcu_read_lock/unlock() can be placed around the loop. Since the write side does a synchronize_rcu(), the loop that observed the old contents is known to be complete before the synchronize_rcu() has completed. RCU, combined with disable_all_migrate_targets(), ensures that the old migration state is not visible by the time __set_migration_target_nodes() is called. === What does READ_ONCE() provide? === READ_ONCE() forbids the compiler from merging or reordering successive reads of node_demotion[]. This ensures that any updates are *eventually* observed. Consider the above loop again. The compiler could theoretically read the entirety of node_demotion[] into local storage (registers) and never go back to memory, and *permanently* observe bad values for node_demotion[]. Note: RCU does not provide any universal compiler-ordering guarantees: https://lore.kernel.org/lkml/20150921204327.GH4029@linux.vnet.ibm.com/ This code is unused for now. It will be called later in the series. Link: https://lkml.kernel.org/r/20210721063926.3024591-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-2-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ++ mm/migrate.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index 57e28261a3b1..cf3cb933eba3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -543,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #ifdef CONFIG_NUMA extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} #endif extern int hwpoison_filter(struct page *p); diff --git a/mm/migrate.c b/mm/migrate.c index 7e240437e7d9..57aeb9b491da 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1099,6 +1099,80 @@ out: return rc; } + +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { 1, // Node 0 migrates to 1 + * 2, // Node 1 migrates to 2 + * -1, // Node 2 does not migrate + * 4, // Node 3 migrates to 4 + * 5, // Node 4 migrates to 5 + * -1} // Node 5 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +static int node_demotion[MAX_NUMNODES] __read_mostly = + {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * @returns: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + int target; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target = READ_ONCE(node_demotion[node]); + rcu_read_unlock(); + + return target; +} + /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -2982,3 +3056,145 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ + +/* Disable reclaim-based migration. */ +static void __disable_all_migrate_targets(void) +{ + int node; + + for_each_online_node(node) + node_demotion[node] = NUMA_NO_NODE; +} + +static void disable_all_migrate_targets(void) +{ + __disable_all_migrate_targets(); + + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + * + * The before+after state together might have cycles and + * could cause readers to do things like loop until this + * function finishes. This ensures they can only see a + * single "bad" read and would, for instance, only loop + * once. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for 'node'. + * Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static int establish_migrate_target(int node, nodemask_t *used) +{ + int migration_target; + + /* + * Can not set a migration target on a + * node with it already set. + * + * No need for READ_ONCE() here since this + * in the write path for node_demotion[]. + * This should be the only thread writing. + */ + if (node_demotion[node] != NUMA_NO_NODE) + return NUMA_NO_NODE; + + migration_target = find_next_best_node(node, used); + if (migration_target == NUMA_NO_NODE) + return NUMA_NO_NODE; + + node_demotion[node] = migration_target; + + return migration_target; +} + +/* + * When memory fills up on a node, memory contents can be + * automatically migrated to another node instead of + * discarded at reclaim. + * + * Establish a "migration path" which will start at nodes + * with CPUs and will follow the priorities used to build the + * page allocator zonelists. + * + * The difference here is that cycles must be avoided. If + * node0 migrates to node1, then neither node1, nor anything + * node1 migrates to can migrate to node0. + * + * This function can run simultaneously with readers of + * node_demotion[]. However, it can not run simultaneously + * with itself. Exclusion is provided by memory hotplug events + * being single-threaded. + */ +static void __set_migration_target_nodes(void) +{ + nodemask_t next_pass = NODE_MASK_NONE; + nodemask_t this_pass = NODE_MASK_NONE; + nodemask_t used_targets = NODE_MASK_NONE; + int node; + + /* + * Avoid any oddities like cycles that could occur + * from changes in the topology. This will leave + * a momentary gap when migration is disabled. + */ + disable_all_migrate_targets(); + + /* + * Allocations go close to CPUs, first. Assume that + * the migration path starts at the nodes with CPUs. + */ + next_pass = node_states[N_CPU]; +again: + this_pass = next_pass; + next_pass = NODE_MASK_NONE; + /* + * To avoid cycles in the migration "graph", ensure + * that migration sources are not future targets by + * setting them in 'used_targets'. Do this only + * once per pass so that multiple source nodes can + * share a target node. + * + * 'used_targets' will become unavailable in future + * passes. This limits some opportunities for + * multiple source nodes to share a destination. + */ + nodes_or(used_targets, used_targets, this_pass); + for_each_node_mask(node, this_pass) { + int target_node = establish_migrate_target(node, &used_targets); + + if (target_node == NUMA_NO_NODE) + continue; + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } + /* + * 'next_pass' contains nodes which became migration + * targets in this pass. Make additional passes until + * no more migrations targets are available. + */ + if (!nodes_empty(next_pass)) + goto again; +} + +/* + * For callers that do not hold get_online_mems() already. + */ +__maybe_unused // <- temporay to prevent warnings during bisects +static void set_migration_target_nodes(void) +{ + get_online_mems(); + __set_migration_target_nodes(); + put_online_mems(); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa936efad7e..cafdca874e0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6157,7 +6157,7 @@ static int node_load[MAX_NUMNODES]; * * Return: node id of the found node or %NUMA_NO_NODE if no node is found. */ -static int find_next_best_node(int node, nodemask_t *used_node_mask) +int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; From 884a6e5d1f93b5032e5d6dd2a183f8b3f008416b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:09 -0700 Subject: [PATCH 436/474] mm/migrate: update node demotion order on hotplug events Reclaim-based migration is attempting to optimize data placement in memory based on the system topology. If the system changes, so must the migration ordering. The implementation is conceptually simple and entirely unoptimized. On any memory or CPU hotplug events, assume that a node was added or removed and recalculate all migration targets. This ensures that the node_demotion[] array is always ready to be used in case the new reclaim mode is enabled. This recalculation is far from optimal, most glaringly that it does not even attempt to figure out the hotplug event would have some *actual* effect on the demotion order. But, given the expected paucity of hotplug events, this should be fine. Link: https://lkml.kernel.org/r/20210721063926.3024591-2-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-3-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 57aeb9b491da..0c12af203b68 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -3057,6 +3058,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +#if defined(CONFIG_MEMORY_HOTPLUG) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { @@ -3191,10 +3193,96 @@ again: /* * For callers that do not hold get_online_mems() already. */ -__maybe_unused // <- temporay to prevent warnings during bisects static void set_migration_target_nodes(void) { get_online_mems(); __set_migration_target_nodes(); put_online_mems(); } + +/* + * React to hotplug events that might affect the migration targets + * like events that online or offline NUMA nodes. + * + * The ordering is also currently dependent on which nodes have + * CPUs. That means we need CPU on/offline notification too. + */ +static int migration_online_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +static int migration_offline_cpu(unsigned int cpu) +{ + set_migration_target_nodes(); + return 0; +} + +/* + * This leaves migrate-on-reclaim transiently disabled between + * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs + * whether reclaim-based migration is enabled or not, which + * ensures that the user can turn reclaim-based migration at + * any time without needing to recalculate migration targets. + * + * These callbacks already hold get_online_mems(). That is why + * __set_migration_target_nodes() can be used as opposed to + * set_migration_target_nodes(). + */ +static int __meminit migrate_on_reclaim_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Make sure there are not transient states where + * an offline node is a migration target. This + * will leave migration disabled until the offline + * completes and the MEM_OFFLINE case below runs. + */ + disable_all_migrate_targets(); + break; + case MEM_OFFLINE: + case MEM_ONLINE: + /* + * Recalculate the target nodes once the node + * reaches its final state (online or offline). + */ + __set_migration_target_nodes(); + break; + case MEM_CANCEL_OFFLINE: + /* + * MEM_GOING_OFFLINE disabled all the migration + * targets. Reenable them. + */ + __set_migration_target_nodes(); + break; + case MEM_GOING_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + return notifier_from_errno(0); +} + +static int __init migrate_on_reclaim_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim", + migration_online_cpu, + migration_offline_cpu); + /* + * In the unlikely case that this fails, the automatic + * migration targets may become suboptimal for nodes + * where N_CPU changes. With such a small impact in a + * rare case, do not bother trying to do anything special. + */ + WARN_ON(ret < 0); + + hotplug_memory_notifier(migrate_on_reclaim_callback, 100); + return 0; +} +late_initcall(migrate_on_reclaim_init); +#endif /* CONFIG_MEMORY_HOTPLUG */ From 5ac95884a784e822b8cbe3d4bd6e9f96b3b71e3f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:13 -0700 Subject: [PATCH 437/474] mm/migrate: enable returning precise migrate_pages() success count Under normal circumstances, migrate_pages() returns the number of pages migrated. In error conditions, it returns an error code. When returning an error code, there is no way to know how many pages were migrated or not migrated. Make migrate_pages() return how many pages are demoted successfully for all cases, including when encountering errors. Page reclaim behavior will depend on this in subsequent patches. Link: https://lkml.kernel.org/r/20210721063926.3024591-3-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-4-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Oscar Salvador [optional parameter] Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- mm/compaction.c | 2 +- mm/gup.c | 2 +- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/migrate.c | 11 ++++++++--- mm/page_alloc.c | 2 +- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 23dadf7aeba8..8ab88d46318e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,7 +41,8 @@ extern int migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason); + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); @@ -56,7 +57,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping, static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, - int reason) + int reason, unsigned int *ret_succeeded) { return -ENOSYS; } static inline struct page *alloc_migration_target(struct page *page, unsigned long private) diff --git a/mm/compaction.c b/mm/compaction.c index 621508e0ecd5..61fb64f47a06 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION); + MR_COMPACTION, NULL); trace_mm_compaction_migratepages(cc->nr_migratepages, err, &cc->migratepages); diff --git a/mm/gup.c b/mm/gup.c index 1c7f4ec6990b..9935a4480710 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1772,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN); + MR_LONGTERM_PIN, NULL); if (ret && !list_empty(&movable_page_list)) putback_movable_pages(&movable_page_list); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f925615e573..517789b03961 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page) if (isolate_page(hpage, &pagelist)) { ret = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL); if (!ret) { bool release = !huge; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 86c3af79e874..4c527a80b6c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (nodes_empty(nmask)) node_set(mtc.nid, nmask); ret = migrate_pages(&source, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); if (ret) { list_for_each_entry(page, &source, lru) { if (__ratelimit(&migrate_rs)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e32360e90274..939eabcaf488 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } @@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); + start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); if (nr_failed) putback_movable_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 0c12af203b68..ae923e9b8874 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. + * @ret_succeeded: Set to the number of pages migrated successfully if + * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. @@ -1439,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; int thp_retry = 1; @@ -1594,6 +1596,9 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; + if (ret_succeeded) + *ret_succeeded = nr_succeeded; + return rc; } @@ -1663,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; @@ -2178,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cafdca874e0d..f95e1d2386a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8990,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migration_target, - NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless From 26aa2d199d6f2cfa6f2ef2a5dfe891f2250e71a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:16 -0700 Subject: [PATCH 438/474] mm/migrate: demote pages during reclaim This is mostly derived from a patch from Yang Shi: https://lore.kernel.org/linux-mm/1560468577-101178-10-git-send-email-yang.shi@linux.alibaba.com/ Add code to the reclaim path (shrink_page_list()) to "demote" data to another NUMA node instead of discarding the data. This always avoids the cost of I/O needed to read the page back in and sometimes avoids the writeout cost when the page is dirty. A second pass through shrink_page_list() will be made if any demotions fail. This essentially falls back to normal reclaim behavior in the case that demotions fail. Previous versions of this patch may have simply failed to reclaim pages which were eligible for demotion but were unable to be demoted in practice. For some cases, for example, MADV_PAGEOUT, the pages are always discarded instead of demoted to follow the kernel API definition. Because MADV_PAGEOUT is defined as freeing specified pages regardless in which tier they are. Note: This just adds the start of infrastructure for migration. It is actually disabled next to the FIXME in migrate_demote_page_ok(). [dave.hansen@linux.intel.com: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-4-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-5-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 ++++ include/trace/events/migrate.h | 3 +- mm/vmscan.c | 85 ++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8ab88d46318e..326250996b4e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -28,6 +28,7 @@ enum migrate_reason { MR_NUMA_MISPLACED, MR_CONTIG_RANGE, MR_LONGTERM_PIN, + MR_DEMOTION, MR_TYPES }; @@ -167,6 +168,14 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int next_demotion_node(int node); + +#else /* CONFIG_MIGRATION disabled: */ + +static inline int next_demotion_node(int node) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_MIGRATION */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 9fb2a3bbcdfb..779f3fad9ecd 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -21,7 +21,8 @@ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ EM( MR_CONTIG_RANGE, "contig_range") \ - EMe(MR_LONGTERM_PIN, "longterm_pin") + EM( MR_LONGTERM_PIN, "longterm_pin") \ + EMe(MR_DEMOTION, "demotion") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/vmscan.c b/mm/vmscan.c index 6c401b44a245..f26b247f5daf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,9 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + /* Allocation order */ s8 order; @@ -518,6 +522,17 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } +static bool can_demote(int nid, struct scan_control *sc) +{ + if (sc->no_demotion) + return false; + if (next_demotion_node(nid) == NUMA_NO_NODE) + return false; + + // FIXME: actually enable this later in the series + return false; +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -1263,6 +1278,49 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } +static struct page *alloc_demote_page(struct page *page, unsigned long node) +{ + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. + */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | + __GFP_THISNODE | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = node + }; + + return alloc_migration_target(page, (unsigned long)&mtc); +} + +/* + * Take pages on @demote_list and attempt to demote them to + * another node. Pages which are not demoted are left on + * @demote_pages. + */ +static unsigned int demote_page_list(struct list_head *demote_pages, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + int err; + + if (list_empty(demote_pages)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + /* Demotion ignores all cpuset and mempolicy settings */ + err = migrate_pages(demote_pages, alloc_demote_page, NULL, + target_nid, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + return nr_succeeded; +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1274,12 +1332,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); + LIST_HEAD(demote_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; + bool do_demote_pass; memset(stat, 0, sizeof(*stat)); cond_resched(); + do_demote_pass = can_demote(pgdat->node_id, sc); +retry: while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1428,6 +1490,17 @@ static unsigned int shrink_page_list(struct list_head *page_list, ; /* try to reclaim the page below */ } + /* + * Before reclaiming the page, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !PageTransHuge(page))) { + list_add(&page->lru, &demote_pages); + unlock_page(page); + continue; + } + /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. @@ -1679,6 +1752,17 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + /* 'page_list' is always empty here */ + + /* Migrate pages selected for demotion */ + nr_reclaimed += demote_page_list(&demote_pages, pgdat); + /* Pages that could not be demoted are still in @demote_pages */ + if (!list_empty(&demote_pages)) { + /* Pages which failed to demoted go back on @page_list for retry: */ + list_splice_init(&demote_pages, page_list); + do_demote_pass = false; + goto retry; + } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; @@ -2326,6 +2410,7 @@ unsigned long reclaim_pages(struct list_head *page_list) .may_writepage = 1, .may_unmap = 1, .may_swap = 1, + .no_demotion = 1, }; noreclaim_flag = memalloc_noreclaim_save(); From 668e4147d8850df32ca41e28f52c146025ca45c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 2 Sep 2021 14:59:19 -0700 Subject: [PATCH 439/474] mm/vmscan: add page demotion counter Account the number of demoted pages. Add pgdemote_kswapd and pgdemote_direct VM counters showed in /proc/vmstat. [ daveh: - __count_vm_events() a bit, and made them look at the THP size directly rather than getting data from migrate_pages() ] Link: https://lkml.kernel.org/r/20210721063926.3024591-5-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-6-ying.huang@intel.com Signed-off-by: Yang Shi Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Wei Xu Reviewed-by: Zi Yan Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Oscar Salvador Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 2 ++ mm/vmscan.c | 5 +++++ mm/vmstat.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ae0dd1948c2b..a185cc75ff52 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, diff --git a/mm/vmscan.c b/mm/vmscan.c index f26b247f5daf..88593b82a8df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1318,6 +1318,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages, target_nid, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); + if (current_is_kswapd()) + __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); + else + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + return nr_succeeded; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b0534e068166..ec5a2e789dd2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgdemote_kswapd", + "pgdemote_direct", "pgscan_kswapd", "pgscan_direct", "pgscan_direct_throttle", From 2f368a9fb7f408ba7d4e6d588e1958fe8b780d08 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:23 -0700 Subject: [PATCH 440/474] mm/vmscan: add helper for querying ability to age anonymous pages Anonymous pages are kept on their own LRU(s). These lists could theoretically always be scanned and maintained. But, without swap, there is currently nothing the kernel can *do* with the results of a scanned, sorted LRU for anonymous pages. A check for '!total_swap_pages' currently serves as a valid check as to whether anonymous LRUs should be maintained. However, another method will be added shortly: page demotion. Abstract out the 'total_swap_pages' checks into a helper, give it a logically significant name, and check for the possibility of page demotion. [dave.hansen@linux.intel.com: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-7-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-6-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-7-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Greg Thelen Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 88593b82a8df..c84aba26d257 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2734,6 +2734,21 @@ out: } } +/* + * Anonymous LRU management is a waste if there is + * ultimately no way to reclaim the memory. + */ +static bool can_age_anon_pages(struct pglist_data *pgdat, + struct scan_control *sc) +{ + /* Aging the anon LRU is valuable if swap is present: */ + if (total_swap_pages > 0) + return true; + + /* Also valuable if anon pages can be demoted: */ + return can_demote(pgdat->node_id, sc); +} + static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; @@ -2843,7 +2858,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -3678,7 +3694,7 @@ static void age_active_anon(struct pglist_data *pgdat, struct mem_cgroup *memcg; struct lruvec *lruvec; - if (!total_swap_pages) + if (!can_age_anon_pages(pgdat, sc)) return; lruvec = mem_cgroup_lruvec(NULL, pgdat); From a2a36488a61cefe3129295c6e75b3987b9d7fd13 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 2 Sep 2021 14:59:26 -0700 Subject: [PATCH 441/474] mm/vmscan: Consider anonymous pages without swap Reclaim anonymous pages if a migration path is available now that demotion provides a non-swap recourse for reclaiming anon pages. Note that this check is subtly different from the can_age_anon_pages() checks. This mechanism checks whether a specific page in a specific context can actually be reclaimed, given current swap space and cgroup limits. can_age_anon_pages() is a much simpler and more preliminary check which just says whether there is a possibility of future reclaim. [kbusch@kernel.org: v11] Link: https://lkml.kernel.org/r/20210715055145.195411-8-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210721063926.3024591-7-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-8-ying.huang@intel.com Cc: Keith Busch Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c84aba26d257..b0970769b100 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,7 +524,7 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { - if (sc->no_demotion) + if (sc && sc->no_demotion) return false; if (next_demotion_node(nid) == NUMA_NO_NODE) return false; @@ -533,6 +533,31 @@ static bool can_demote(int nid, struct scan_control *sc) return false; } +static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, + int nid, + struct scan_control *sc) +{ + if (memcg == NULL) { + /* + * For non-memcg reclaim, is there + * space in any swap device? + */ + if (get_nr_swap_pages() > 0) + return true; + } else { + /* Is the memcg below its swap limit? */ + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) + return true; + } + + /* + * The page can not be swapped. + * + * Can it be reclaimed from this node via demotion? + */ + return can_demote(nid, sc); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -544,7 +569,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); @@ -2541,6 +2566,7 @@ enum scan_balance { static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) { + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct mem_cgroup *memcg = lruvec_memcg(lruvec); unsigned long anon_cost, file_cost, total_cost; int swappiness = mem_cgroup_swappiness(memcg); @@ -2551,7 +2577,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { + if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; } @@ -2929,7 +2955,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, */ pages_for_compaction = compact_gap(sc->order); inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); - if (get_nr_swap_pages() > 0) + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); return inactive_lru_pages > pages_for_compaction; From 3a235693d3930e1276c8d9cc0ca5807ef292cf0a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 14:59:30 -0700 Subject: [PATCH 442/474] mm/vmscan: never demote for memcg reclaim Global reclaim aims to reduce the amount of memory used on a given node or set of nodes. Migrating pages to another node serves this purpose. memcg reclaim is different. Its goal is to reduce the total memory consumption of the entire memcg, across all nodes. Migration does not assist memcg reclaim because it just moves page contents between nodes rather than actually reducing memory consumption. Link: https://lkml.kernel.org/r/20210715055145.195411-9-ying.huang@intel.com Signed-off-by: Dave Hansen Signed-off-by: "Huang, Ying" Suggested-by: Yang Shi Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Wei Xu Cc: Oscar Salvador Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b0970769b100..43289f5f8488 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,8 +524,13 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { - if (sc && sc->no_demotion) - return false; + if (sc) { + if (sc->no_demotion) + return false; + /* It is pointless to do demotion in memcg reclaim */ + if (cgroup_reclaim(sc)) + return false; + } if (next_demotion_node(nid) == NUMA_NO_NODE) return false; From 20b51af15e014cac63b58a4f8b8b323ac35bccce Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 2 Sep 2021 14:59:33 -0700 Subject: [PATCH 443/474] mm/migrate: add sysfs interface to enable reclaim migration Some method is obviously needed to enable reclaim-based migration. Just like traditional autonuma, there will be some workloads that will benefit like workloads with more "static" configurations where hot pages stay hot and cold pages stay cold. If pages come and go from the hot and cold sets, the benefits of this approach will be more limited. The benefits are truly workload-based and *not* hardware-based. We do not believe that there is a viable threshold where certain hardware configurations should have this mechanism enabled while others do not. To be conservative, earlier work defaulted to disable reclaim- based migration and did not include a mechanism to enable it. This proposes add a new sysfs file /sys/kernel/mm/numa/demotion_enabled as a method to enable it. We are open to any alternative that allows end users to enable this mechanism or disable it if workload harm is detected (just like traditional autonuma). Once this is enabled page demotion may move data to a NUMA node that does not fall into the cpuset of the allocating process. This could be construed to violate the guarantees of cpusets. However, since this is an opt-in mechanism, the assumption is that anyone enabling it is content to relax the guarantees. Link: https://lkml.kernel.org/r/20210721063926.3024591-9-ying.huang@intel.com Link: https://lkml.kernel.org/r/20210715055145.195411-10-ying.huang@intel.com Signed-off-by: Huang Ying Originally-by: Dave Hansen Cc: Michal Hocko Cc: Wei Xu Cc: Yang Shi Cc: Zi Yan Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../ABI/testing/sysfs-kernel-mm-numa | 24 ++++++++ include/linux/mempolicy.h | 4 ++ mm/mempolicy.c | 61 +++++++++++++++++++ mm/vmscan.c | 5 +- 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-numa diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa new file mode 100644 index 000000000000..77e559d4ed80 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-numa @@ -0,0 +1,24 @@ +What: /sys/kernel/mm/numa/ +Date: June 2021 +Contact: Linux memory management mailing list +Description: Interface for NUMA + +What: /sys/kernel/mm/numa/demotion_enabled +Date: June 2021 +Contact: Linux memory management mailing list +Description: Enable/disable demoting pages during reclaim + + Page migration during reclaim is intended for systems + with tiered memory configurations. These systems have + multiple types of memory with varied performance + characteristics instead of plain NUMA systems where + the same kind of memory is found at varied distances. + Allowing page migration during reclaim enables these + systems to migrate pages from fast tiers to slow tiers + when the fast tier is under pressure. This migration + is performed before swap. It may move data to a NUMA + node that does not fall into the cpuset of the + allocating process which might be construed to violate + the guarantees of cpusets. This should not be enabled + on systems which need strict cpuset location + guarantees. diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0aaf91b496e2..4ca025e2a77e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -184,6 +184,8 @@ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); +extern bool numa_demotion_enabled; + #else struct mempolicy {}; @@ -292,5 +294,7 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } + +#define numa_demotion_enabled false #endif /* CONFIG_NUMA */ #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 939eabcaf488..e675bfb856da 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3021,3 +3021,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); } + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + numa_demotion_enabled = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + numa_demotion_enabled = false; + else + return -EINVAL; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 43289f5f8488..2255025f1891 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -524,6 +524,8 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker, static bool can_demote(int nid, struct scan_control *sc) { + if (!numa_demotion_enabled) + return false; if (sc) { if (sc->no_demotion) return false; @@ -534,8 +536,7 @@ static bool can_demote(int nid, struct scan_control *sc) if (next_demotion_node(nid) == NUMA_NO_NODE) return false; - // FIXME: actually enable this later in the series - return false; + return true; } static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, From 9647875be52b33fe22cb034ec3074896c581543f Mon Sep 17 00:00:00 2001 From: Hui Su Date: Thu, 2 Sep 2021 14:59:36 -0700 Subject: [PATCH 444/474] mm/vmpressure: replace vmpressure_to_css() with vmpressure_to_memcg() We can get memcg directly form vmpr instead of vmpr->memcg->css->memcg, so add a new func helper vmpressure_to_memcg(). And no code will use vmpressure_to_css(), so delete it. Link: https://lkml.kernel.org/r/20210630112146.455103-1-suhui@zeku.com Signed-off-by: Hui Su Acked-by: Michal Hocko Acked-by: Chris Down Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmpressure.h | 2 +- mm/memcontrol.c | 4 ++-- mm/vmpressure.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6d28bc433c1c..6a2f51ebbfd3 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); -extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr); extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06ae0075e864..896f0f403c52 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) return &memcg->vmpressure; } -struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) { - return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; + return container_of(vmpr, struct mem_cgroup, vmpressure); } #ifdef CONFIG_MEMCG_KMEM diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 9b172561fded..76518e4166dc 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) From d17be2d9ff6c689fd70d2d451153d613508a56ae Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:39 -0700 Subject: [PATCH 445/474] mm/vmscan: remove the PageDirty check after MADV_FREE pages are page_ref_freezed Patch series "Cleanups for vmscan", v2. This series contains cleanups to remove unneeded return value, misleading setting and so on. Also this remove the PageDirty check after MADV_FREE pages are page_ref_freezed. More details can be found in the respective changelogs. This patch (of 4): If the MADV_FREE pages are redirtied before they could be reclaimed, put the pages back to anonymous LRU list by setting SwapBacked flag and the pages will be reclaimed in normal swapout way. But as Yu Zhao pointed out, "The page has only one reference left, which is from the isolation. After the caller puts the page back on lru and drops the reference, the page will be freed anyway. It doesn't matter which lru it goes." So we don't bother checking PageDirty here. [Yu Zhao's comment is also quoted in the code.] Link: https://lkml.kernel.org/r/20210717065911.61497-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210717065911.61497-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yu Zhao Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michal Hocko Cc: Jens Axboe Cc: Joonsoo Kim Cc: Alex Shi Cc: Alistair Popple Cc: Matthew Wilcox Cc: Minchan Kim Cc: David Hildenbrand Cc: Shaohua Li Cc: Hillf Danton Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2255025f1891..044207d0bb66 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1732,11 +1732,14 @@ retry: /* follow __remove_mapping for reference */ if (!page_ref_freeze(page, 1)) goto keep_locked; - if (PageDirty(page)) { - page_ref_unfreeze(page, 1); - goto keep_locked; - } - + /* + * The page has only one reference left, which is + * from the isolation. After the caller puts the + * page back on lru and drops the reference, the + * page will be freed anyway. It doesn't matter + * which lru it goes. So we don't bother checking + * PageDirty here. + */ count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true, From eaad1ae7819fa2b8616a31c66d48982b1bb85d62 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:43 -0700 Subject: [PATCH 446/474] mm/vmscan: remove misleading setting to sc->priority The priority field of sc is used to control how many pages we should scan at once while we always traverse the list to shrink the pages in these functions. So these settings are unneeded and misleading. Link: https://lkml.kernel.org/r/20210717065911.61497-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 044207d0bb66..8857e4dcbfd3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1820,7 +1820,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, { struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_unmap = 1, }; struct reclaim_stat stat; @@ -2445,7 +2444,6 @@ unsigned long reclaim_pages(struct list_head *page_list) unsigned int noreclaim_flag; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, .may_swap = 1, From b87c517ac5de168aec6e8318ca0707b11b2ccfaf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:46 -0700 Subject: [PATCH 447/474] mm/vmscan: remove unneeded return value of kswapd_run() The return value of kswapd_run() is unused now. Clean it up. Link: https://lkml.kernel.org/r/20210717065911.61497-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/vmscan.c | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f30d26b0f71d..ba52f3a3478e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void) extern void check_move_unevictable_pages(struct pagevec *pvec); -extern int kswapd_run(int nid); +extern void kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP diff --git a/mm/vmscan.c b/mm/vmscan.c index 8857e4dcbfd3..ab5019700dc3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4434,23 +4434,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int kswapd_run(int nid) +void kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - int ret = 0; if (pgdat->kswapd) - return 0; + return; pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ BUG_ON(system_state < SYSTEM_RUNNING); pr_err("Failed to start kswapd on node %d\n", nid); - ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; } - return ret; } /* From 2e786d9e5a2014c327d9b2eec83fa60b16af26f9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 14:59:50 -0700 Subject: [PATCH 448/474] mm/vmscan: add 'else' to remove check_pending label We could add 'else' to remove the somewhat odd check_pending label to make code core succinct. Link: https://lkml.kernel.org/r/20210717065911.61497-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Michal Hocko Cc: Alex Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Hillf Danton Cc: Jens Axboe Cc: Johannes Weiner Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Minchan Kim Cc: Shaohua Li Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ab5019700dc3..1b14a1b2539c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3578,18 +3578,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, * blocked waiting on the same lock. Instead, throttle for up to a * second before continuing. */ - if (!(gfp_mask & __GFP_FS)) { + if (!(gfp_mask & __GFP_FS)) wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, allow_direct_reclaim(pgdat), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat)); - goto check_pending; - } - - /* Throttle until kswapd wakes the process */ - wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - allow_direct_reclaim(pgdat)); - -check_pending: if (fatal_signal_pending(current)) return true; From 1399af7e54896c774d67f1c1acc491b07149421d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 2 Sep 2021 14:59:53 -0700 Subject: [PATCH 449/474] mm, vmscan: guarantee drop_slab_node() termination drop_slab_node() is called as part of echo 2>/proc/sys/vm/drop_caches operation. It iterates over all memcgs and calls shrink_slab() which in turn iterates over all slab shrinkers. Freed objects are counted and as long as the total number of freed objects from all memcgs and shrinkers is higher than 10, drop_slab_node() loops for another full memcgs*shrinkers iteration. This arbitrary constant threshold of 10 can result in effectively an infinite loop on a system with large number of memcgs and/or parallel activity that allocates new objects. This has been reported previously by Chunxin Zang [1] and recently by our customer. The previous report [1] has resulted in commit 069c411de40a ("mm/vmscan: fix infinite loop in drop_slab_node") which added a check for signals allowing the user to terminate the command writing to drop_caches. At the time it was also considered to make the threshold grow with each iteration to guarantee termination, but such patch hasn't been formally proposed yet. This patch implements the dynamically growing threshold. At first iteration it's enough to free one object to continue, and this threshold effectively doubles with each iteration. Our customer's feedback was positive. There is always a risk that this change will result on some system in a previously terminating drop_caches operation to terminate sooner and free fewer objects. Ideally the semantics would guarantee freeing all freeable objects that existed at the moment of starting the operation, while not looping forever for newly allocated objects, but that's not feasible to track. In the less ideal solution based on thresholds, arguably the termination guarantee is more important than the exhaustiveness guarantee. If there are reports of large regression wrt being exhaustive, we can tune how fast the threshold grows. [1] https://lore.kernel.org/lkml/20200909152047.27905-1-zangchunxin@bytedance.com/T/#u [vbabka@suse.cz: avoid undefined shift behaviour] Link: https://lkml.kernel.org/r/2f034e6f-a753-550a-f374-e4e23899d3d5@suse.cz Link: https://lkml.kernel.org/r/20210818152239.25502-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Chunxin Zang Cc: Muchun Song Cc: Chris Down Cc: Michal Hocko Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b14a1b2539c..740d03e6dae2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -939,6 +939,7 @@ out: void drop_slab_node(int nid) { unsigned long freed; + int shift = 0; do { struct mem_cgroup *memcg = NULL; @@ -951,7 +952,7 @@ void drop_slab_node(int nid) do { freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - } while (freed > 10); + } while ((freed >> shift++) > 1); } void drop_slab(void) From e1e92bfa3825b72be4957f9fef267b3106d20aa6 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:56 -0700 Subject: [PATCH 450/474] mm: compaction: optimize proactive compaction deferrals Vlastimil Babka figured out that when fragmentation score didn't go down across the proactive compaction i.e. when no progress is made, next wake up for proactive compaction is deferred for 1 << COMPACT_MAX_DEFER_SHIFT, i.e. 64 times, with each wakeup interval of HPAGE_FRAG_CHECK_INTERVAL_MSEC(=500). In each of this wakeup, it just decrement 'proactive_defer' counter and goes sleep i.e. it is getting woken to just decrement a counter. The same deferral time can also achieved by simply doing the HPAGE_FRAG_CHECK_INTERVAL_MSEC << COMPACT_MAX_DEFER_SHIFT thus unnecessary wakeup of kcompact thread is avoided thus also removes the need of 'proactive_defer' thread counter. [akpm@linux-foundation.org: tweak comment] Link: https://lore.kernel.org/linux-fsdevel/88abfdb6-2c13-b5a6-5b46-742d12d1c910@suse.cz/ Link: https://lkml.kernel.org/r/1626869599-25412-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Reviewed-by: Khalid Aziz Acked-by: David Rientjes Cc: Nitin Gupta Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 61fb64f47a06..4ee0d40d93f2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2885,7 +2885,8 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - unsigned int proactive_defer = 0; + long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); + long timeout = default_timeout; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2902,23 +2903,30 @@ static int kcompactd(void *p) trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), - msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + kcompactd_work_requested(pgdat), timeout)) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); psi_memstall_leave(&pflags); + /* + * Reset the timeout value. The defer timeout from + * proactive compaction is lost here but that is fine + * as the condition of the zone changing substantionally + * then carrying on with the previous defer interval is + * not useful. + */ + timeout = default_timeout; continue; } - /* kcompactd wait timeout */ + /* + * Start the proactive work with default timeout. Based + * on the fragmentation score, this timeout is updated. + */ + timeout = default_timeout; if (should_proactive_compact_node(pgdat)) { unsigned int prev_score, score; - if (proactive_defer) { - proactive_defer--; - continue; - } prev_score = fragmentation_score_node(pgdat); proactive_compact_node(pgdat); score = fragmentation_score_node(pgdat); @@ -2926,8 +2934,9 @@ static int kcompactd(void *p) * Defer proactive compaction if the fragmentation * score did not go down i.e. no progress made. */ - proactive_defer = score < prev_score ? - 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + if (unlikely(score >= prev_score)) + timeout = + default_timeout << COMPACT_MAX_DEFER_SHIFT; } } From 65d759c8f9f57b96c199f3fe5cfb93ac7da095e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 2 Sep 2021 14:59:59 -0700 Subject: [PATCH 451/474] mm: compaction: support triggering of proactive compaction by user The proactive compaction[1] gets triggered for every 500msec and run compaction on the node for COMPACTION_HPAGE_ORDER (usually order-9) pages based on the value set to sysctl.compaction_proactiveness. Triggering the compaction for every 500msec in search of COMPACTION_HPAGE_ORDER pages is not needed for all applications, especially on the embedded system usecases which may have few MB's of RAM. Enabling the proactive compaction in its state will endup in running almost always on such systems. Other side, proactive compaction can still be very much useful for getting a set of higher order pages in some controllable manner(controlled by using the sysctl.compaction_proactiveness). So, on systems where enabling the proactive compaction always may proove not required, can trigger the same from user space on write to its sysctl interface. As an example, say app launcher decide to launch the memory heavy application which can be launched fast if it gets more higher order pages thus launcher can prepare the system in advance by triggering the proactive compaction from userspace. This triggering of proactive compaction is done on a write to sysctl.compaction_proactiveness by user. [1]https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=facdaa917c4d5a376d09d25865f5a863f906234a [akpm@linux-foundation.org: tweak vm.rst, per Mike] Link: https://lkml.kernel.org/r/1627653207-12317-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: Rafael Aquini Cc: Mike Rapoport Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Mel Gorman Cc: Nitin Gupta Cc: Jonathan Corbet Cc: Khalid Aziz Cc: David Rientjes Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 3 +- include/linux/compaction.h | 2 ++ include/linux/mmzone.h | 1 + kernel/sysctl.c | 2 +- mm/compaction.c | 38 +++++++++++++++++++++++-- 5 files changed, 42 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 003d5cc3751b..5e795202111f 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -118,7 +118,8 @@ compaction_proactiveness This tunable takes a value in the range [0, 100] with a default value of 20. This tunable determines how aggressively compaction is done in the -background. Setting it to 0 disables proactive compaction. +background. Write of a non zero value to this tunable will immediately +trigger the proactive compaction. Setting it to 0 disables proactive compaction. Note that compaction has a non-trivial system-wide impact as pages belonging to different processes are moved around, which could also lead diff --git a/include/linux/compaction.h b/include/linux/compaction.h index c24098c7acca..34bce35c808d 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order) extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); +extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 59bad25ce78e..1bd5f5955f9a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -846,6 +846,7 @@ typedef struct pglist_data { enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; + bool proactive_compact_trigger; #endif /* * This is a per-node reserve of pages that are not available diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..297f0b3966bd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2871,7 +2871,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_compaction_proactiveness, .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &one_hundred, }, diff --git a/mm/compaction.c b/mm/compaction.c index 4ee0d40d93f2..fa9b2b598eab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2706,6 +2706,30 @@ static void compact_nodes(void) */ unsigned int __read_mostly sysctl_compaction_proactiveness = 20; +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -2901,9 +2926,16 @@ static int kcompactd(void *p) while (!kthread_should_stop()) { unsigned long pflags; + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. + */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); if (wait_event_freezable_timeout(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat), timeout)) { + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { psi_memstall_enter(&pflags); kcompactd_do_work(pgdat); @@ -2938,6 +2970,8 @@ static int kcompactd(void *p) timeout = default_timeout << COMPACT_MAX_DEFER_SHIFT; } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; } return 0; From 062db29358c9bd40d8aa9e96ce7b492b03d669d5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 2 Sep 2021 15:00:03 -0700 Subject: [PATCH 452/474] mm/mempolicy: use readable NUMA_NO_NODE macro instead of magic number The caller of mpol_misplaced() already use NUMA_NO_NODE to check whether current page node is misplaced, thus using NUMA_NO_NODE in mpol_misplaced() instead of magic number is more readable. Link: https://lkml.kernel.org/r/1b77c0ce21183fa86f4db250b115cf5e27396528.1627558356.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e675bfb856da..c473d2931708 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2425,8 +2425,8 @@ static void sp_free(struct sp_node *n) * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * - * Return: -1 if the page is in a node that is valid for this policy, or a - * suitable node ID to allocate a replacement page from. + * Return: NUMA_NO_NODE if the page is in a node that is valid for this + * policy, or a suitable node ID to allocate a replacement page from. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { @@ -2437,7 +2437,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; - int ret = -1; + int ret = NUMA_NO_NODE; pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) From b27abaccf8e8b012f126da0c2a1ab32723ec8b9f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Sep 2021 15:00:06 -0700 Subject: [PATCH 453/474] mm/mempolicy: add MPOL_PREFERRED_MANY for multiple preferred nodes Patch series "Introduce multi-preference mempolicy", v7. This patch series introduces the concept of the MPOL_PREFERRED_MANY mempolicy. This mempolicy mode can be used with either the set_mempolicy(2) or mbind(2) interfaces. Like the MPOL_PREFERRED interface, it allows an application to set a preference for nodes which will fulfil memory allocation requests. Unlike the MPOL_PREFERRED mode, it takes a set of nodes. Like the MPOL_BIND interface, it works over a set of nodes. Unlike MPOL_BIND, it will not cause a SIGSEGV or invoke the OOM killer if those preferred nodes are not available. Along with these patches are patches for libnuma, numactl, numademo, and memhog. They still need some polish, but can be found here: https://gitlab.com/bwidawsk/numactl/-/tree/prefer-many It allows new usage: `numactl -P 0,3,4` The goal of the new mode is to enable some use-cases when using tiered memory usage models which I've lovingly named. 1a. The Hare - The interconnect is fast enough to meet bandwidth and latency requirements allowing preference to be given to all nodes with "fast" memory. 1b. The Indiscriminate Hare - An application knows it wants fast memory (or perhaps slow memory), but doesn't care which node it runs on. The application can prefer a set of nodes and then xpu bind to the local node (cpu, accelerator, etc). This reverses the nodes are chosen today where the kernel attempts to use local memory to the CPU whenever possible. This will attempt to use the local accelerator to the memory. 2. The Tortoise - The administrator (or the application itself) is aware it only needs slow memory, and so can prefer that. Much of this is almost achievable with the bind interface, but the bind interface suffers from an inability to fallback to another set of nodes if binding fails to all nodes in the nodemask. Like MPOL_BIND a nodemask is given. Inherently this removes ordering from the preference. > /* Set first two nodes as preferred in an 8 node system. */ > const unsigned long nodes = 0x3 > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); > /* Mimic interleave policy, but have fallback *. > const unsigned long nodes = 0xaa > set_mempolicy(MPOL_PREFER_MANY, &nodes, 8); Some internal discussion took place around the interface. There are two alternatives which we have discussed, plus one I stuck in: 1. Ordered list of nodes. Currently it's believed that the added complexity is nod needed for expected usecases. 2. A flag for bind to allow falling back to other nodes. This confuses the notion of binding and is less flexible than the current solution. 3. Create flags or new modes that helps with some ordering. This offers both a friendlier API as well as a solution for more customized usage. It's unknown if it's worth the complexity to support this. Here is sample code for how this might work: > // Prefer specific nodes for some something wacky > set_mempolicy(MPOL_PREFER_MANY, 0x17c, 1024); > > // Default > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_SOCKET, NULL, 0); > // which is the same as > set_mempolicy(MPOL_DEFAULT, NULL, 0); > > // The Hare > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, NULL, 0); > > // The Tortoise > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE_REV, NULL, 0); > > // Prefer the fast memory of the first two sockets > set_mempolicy(MPOL_PREFER_MANY | MPOL_F_PREFER_ORDER_TYPE, -1, 2); > This patch (of 5): The NUMA APIs currently allow passing in a "preferred node" as a single bit set in a nodemask. If more than one bit it set, bits after the first are ignored. This single node is generally OK for location-based NUMA where memory being allocated will eventually be operated on by a single CPU. However, in systems with multiple memory types, folks want to target a *type* of memory instead of a location. For instance, someone might want some high-bandwidth memory but do not care about the CPU next to which it is allocated. Or, they want a cheap, high capacity allocation and want to target all NUMA nodes which have persistent memory in volatile mode. In both of these cases, the application wants to target a *set* of nodes, but does not want strict MPOL_BIND behavior as that could lead to OOM killer or SIGSEGV. So add MPOL_PREFERRED_MANY policy to support the multiple preferred nodes requirement. This is not a pie-in-the-sky dream for an API. This was a response to a specific ask of more than one group at Intel. Specifically: 1. There are existing libraries that target memory types such as https://github.com/memkind/memkind. These are known to suffer from SIGSEGV's when memory is low on targeted memory "kinds" that span more than one node. The MCDRAM on a Xeon Phi in "Cluster on Die" mode is an example of this. 2. Volatile-use persistent memory users want to have a memory policy which is targeted at either "cheap and slow" (PMEM) or "expensive and fast" (DRAM). However, they do not want to experience allocation failures when the targeted type is unavailable. 3. Allocate-then-run. Generally, we let the process scheduler decide on which physical CPU to run a task. That location provides a default allocation policy, and memory availability is not generally considered when placing tasks. For situations where memory is valuable and constrained, some users want to allocate memory first, *then* allocate close compute resources to the allocation. This is the reverse of the normal (CPU) model. Accelerators such as GPUs that operate on core-mm-managed memory are interested in this model. A check is added in sanitize_mpol_flags() to not permit 'prefer_many' policy to be used for now, and will be removed in later patch after all implementations for 'prefer_many' are ready, as suggested by Michal Hocko. [mhocko@kernel.org: suggest to refine policy_node/policy_nodemask handling] Link: https://lkml.kernel.org/r/1627970362-61305-1-git-send-email-feng.tang@intel.com Link: https://lore.kernel.org/r/20200630212517.308045-4-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-2-git-send-email-feng.tang@intel.com Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Dave Hansen Signed-off-by: Feng Tang Cc: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying b Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 1 + mm/mempolicy.c | 73 +++++++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 19a00bc7fe86..046d0ccba4cd 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -22,6 +22,7 @@ enum { MPOL_BIND, MPOL_INTERLEAVE, MPOL_LOCAL, + MPOL_PREFERRED_MANY, MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c473d2931708..f8f2c18e61cb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -31,6 +31,9 @@ * but useful to set in a VMA when you have a non default * process policy. * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -207,6 +210,14 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } +static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->nodes = *nodes; + return 0; +} + static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) @@ -408,6 +419,10 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_preferred_many, + .rebind = mpol_rebind_preferred, + }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, @@ -900,6 +915,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: *nodes = p->nodes; break; case MPOL_LOCAL: @@ -1446,7 +1462,13 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - if ((unsigned int)(*mode) >= MPOL_MAX) + + /* + * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' + * is not fully implemented, don't permit it to be used for now, + * and the logic will be restored in following patch + */ + if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; @@ -1875,16 +1897,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { + int mode = policy->mode; + /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(policy->mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + if (unlikely(mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; + + if (mode == MPOL_PREFERRED_MANY) return &policy->nodes; return NULL; } -/* Return the node id preferred by the given mempolicy, or the given id */ +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy. + */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED) { @@ -1936,7 +1969,9 @@ unsigned int mempolicy_slab_node(void) case MPOL_INTERLEAVE: return interleave_nodes(policy); - case MPOL_BIND: { + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { struct zoneref *z; /* @@ -2008,12 +2043,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy - * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. - * If the effective policy is 'BIND, returns a pointer to the mempolicy's - * @nodemask for filtering the zonelist. + * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ @@ -2021,16 +2056,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; + int mode; *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; /* assume !MPOL_BIND */ + *nodemask = NULL; + mode = (*mpol)->mode; - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { + if (unlikely(mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if ((*mpol)->mode == MPOL_BIND) + if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) *nodemask = &(*mpol)->nodes; } return nid; @@ -2063,6 +2100,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->nodes; @@ -2173,7 +2211,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * - * If the policy is interleave, or does not allow the current + * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED) @@ -2311,6 +2349,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: return !!nodes_equal(a->nodes, b->nodes); case MPOL_LOCAL: return true; @@ -2451,6 +2490,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_PREFERRED: + if (node_isset(curnid, pol->nodes)) + goto out; polnid = first_node(pol->nodes); break; @@ -2465,9 +2506,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; goto out; } + fallthrough; + case MPOL_PREFERRED_MANY: /* - * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. @@ -2829,6 +2871,7 @@ static const char * const policy_modes[] = [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", + [MPOL_PREFERRED_MANY] = "prefer (many)", }; @@ -2907,6 +2950,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) if (!nodelist) err = 0; goto out; + case MPOL_PREFERRED_MANY: case MPOL_BIND: /* * Insist on a nodelist @@ -2993,6 +3037,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) case MPOL_LOCAL: break; case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->nodes; From 4c54d94908e089e9741513797eac30a8b8217034 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 2 Sep 2021 15:00:10 -0700 Subject: [PATCH 454/474] mm/memplicy: add page allocation function for MPOL_PREFERRED_MANY policy The semantics of MPOL_PREFERRED_MANY is similar to MPOL_PREFERRED, that it will first try to allocate memory from the preferred node(s), and fallback to all nodes in system when first try fails. Add a dedicated function alloc_pages_preferred_many() for it just like for 'interleave' policy, which will be used by 2 general memoory allocation APIs: alloc_pages() and alloc_pages_vma() Link: https://lore.kernel.org/r/20200630212517.308045-9-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-3-git-send-email-feng.tang@intel.com Suggested-by: Michal Hocko Originally-by: Ben Widawsky Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f8f2c18e61cb..64be3565ab7f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2166,6 +2166,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; } +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + if (!page) + page = __alloc_pages(gfp, order, numa_node_id(), NULL); + + return page; +} + /** * alloc_pages_vma - Allocate a page for a VMA. * @gfp: GFP flags. @@ -2201,6 +2222,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, goto out; } + if (pol->mode == MPOL_PREFERRED_MANY) { + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + goto out; + } + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; @@ -2278,6 +2305,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + numa_node_id(), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), From cfcaa66f803233c50e17239469f6c96136a673a1 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 2 Sep 2021 15:00:13 -0700 Subject: [PATCH 455/474] mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY Implement the missing huge page allocation functionality while obeying the preferred node semantics. This is similar to the implementation for general page allocation, as it uses a fallback mechanism to try multiple preferred nodes first, and then all other nodes. To avoid adding too many "#ifdef CONFIG_NUMA" check, add a helper function in mempolicy.h to check whether a mempolicy is MPOL_PREFERRED_MANY. [akpm@linux-foundation.org: fix compiling issue when merging with other hugetlb patch] [Thanks to 0day bot for catching the !CONFIG_NUMA compiling issue] [mhocko@suse.com: suggest to remove the #ifdef CONFIG_NUMA check] [ben.widawsky@intel.com: add helpers to avoid ifdefs] Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com [nathan@kernel.org: initialize page to NULL in alloc_buddy_huge_page_with_mpol()] Link: https://lkml.kernel.org/r/20210810200632.3812797-1-nathan@kernel.org Link: https://lore.kernel.org/r/20200630212517.308045-12-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-4-git-send-email-feng.tang@intel.com Link: https://lkml.kernel.org/r/20210809024430.GA46432@shbuild999.sh.intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Signed-off-by: Nathan Chancellor Co-developed-by: Feng Tang Suggested-by: Michal Hocko Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 12 ++++++++++++ mm/hugetlb.c | 30 +++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4ca025e2a77e..4091692bed8c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -186,6 +186,12 @@ extern void mpol_put_task_policy(struct task_struct *); extern bool numa_demotion_enabled; +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return (pol->mode == MPOL_PREFERRED_MANY); +} + + #else struct mempolicy {}; @@ -296,5 +302,11 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) } #define numa_demotion_enabled false + +static inline bool mpol_is_preferred_many(struct mempolicy *pol) +{ + return false; +} + #endif /* CONFIG_NUMA */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41a1778d3f67..95dc7b83381f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1145,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1166,7 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + if (mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; @@ -2142,16 +2152,26 @@ static struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); - mpol_cond_put(mpol); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + mpol_cond_put(mpol); return page; } From a38a59fdfa10be55d08e4530923d950e739ac6a2 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 2 Sep 2021 15:00:16 -0700 Subject: [PATCH 456/474] mm/mempolicy: advertise new MPOL_PREFERRED_MANY Adds a new mode to the existing mempolicy modes, MPOL_PREFERRED_MANY. MPOL_PREFERRED_MANY will be adequately documented in the internal admin-guide with this patch. Eventually, the man pages for mbind(2), get_mempolicy(2), set_mempolicy(2) and numactl(8) will also have text about this mode. Those shall contain the canonical reference. NUMA systems continue to become more prevalent. New technologies like PMEM make finer grain control over memory access patterns increasingly desirable. MPOL_PREFERRED_MANY allows userspace to specify a set of nodes that will be tried first when performing allocations. If those allocations fail, all remaining nodes will be tried. It's a straight forward API which solves many of the presumptive needs of system administrators wanting to optimize workloads on such machines. The mode will work either per VMA, or per thread. [Michal Hocko: refine kernel doc for MPOL_PREFERRED_MANY] Link: https://lore.kernel.org/r/20200630212517.308045-13-ben.widawsky@intel.com Link: https://lkml.kernel.org/r/1627970362-61305-5-git-send-email-feng.tang@intel.com Signed-off-by: Ben Widawsky Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/numa_memory_policy.rst | 15 +++++++++++---- mm/mempolicy.c | 7 +------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 067a90a1499c..64fd0ba0d057 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -245,6 +245,13 @@ MPOL_INTERLEAVED address range or file. During system boot up, the temporary interleaved system default policy works in this mode. +MPOL_PREFERRED_MANY + This mode specifices that the allocation should be preferrably + satisfied from the nodemask specified in the policy. If there is + a memory pressure on all nodes in the nodemask, the allocation + can fall back to all existing numa nodes. This is effectively + MPOL_PREFERRED allowed for a mask rather than a single node. + NUMA memory policy supports the following optional mode flags: MPOL_F_STATIC_NODES @@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES nodes changes after the memory policy has been defined. Without this flag, any time a mempolicy is rebound because of a - change in the set of allowed nodes, the node (Preferred) or - nodemask (Bind, Interleave) is remapped to the new set of - allowed nodes. This may result in nodes being used that were - previously undesired. + change in the set of allowed nodes, the preferred nodemask (Preferred + Many), preferred node (Preferred) or nodemask (Bind, Interleave) is + remapped to the new set of allowed nodes. This may result in nodes + being used that were previously undesired. With this flag, if the user-specified nodes overlap with the nodes allowed by the task's cpuset, then the memory policy is diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 64be3565ab7f..9e58854d7929 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1463,12 +1463,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; - /* - * The check should be 'mode >= MPOL_MAX', but as 'prefer_many' - * is not fully implemented, don't permit it to be used for now, - * and the logic will be restored in following patch - */ - if ((unsigned int)(*mode) >= MPOL_PREFERRED_MANY) + if ((unsigned int)(*mode) >= MPOL_MAX) return -EINVAL; if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; From be897d48a971e36daadbd9289967e7e4f3749528 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 2 Sep 2021 15:00:19 -0700 Subject: [PATCH 457/474] mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies As they all do the same thing: sanity check and save nodemask info, create one mpol_new_nodemask() to reduce redundancy. Link: https://lkml.kernel.org/r/1627970362-61305-6-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Ben Widawsky Cc: Dan Williams Cc: Dave Hansen Cc: David Rientjes Cc: Huang Ying Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9e58854d7929..bd4f249aada3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -192,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, nodes_onto(*ret, tmp, *rel); } -static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; @@ -210,22 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) return 0; } -static int mpol_new_preferred_many(struct mempolicy *pol, const nodemask_t *nodes) -{ - if (nodes_empty(*nodes)) - return -EINVAL; - pol->nodes = *nodes; - return 0; -} - -static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) -{ - if (nodes_empty(*nodes)) - return -EINVAL; - pol->nodes = *nodes; - return 0; -} - /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes @@ -405,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { - .create = mpol_new_interleave, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { @@ -413,14 +397,14 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { - .create = mpol_new_bind, + .create = mpol_new_nodemask, .rebind = mpol_rebind_nodemask, }, [MPOL_LOCAL] = { .rebind = mpol_rebind_default, }, [MPOL_PREFERRED_MANY] = { - .create = mpol_new_preferred_many, + .create = mpol_new_nodemask, .rebind = mpol_rebind_preferred, }, }; From 38b031dd4d03542d963eebe600d67ea34f47eb65 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 2 Sep 2021 15:00:23 -0700 Subject: [PATCH 458/474] mm/mempolicy.c: use in_task() in mempolicy_slab_node() Obsoleted in_intrrupt() include task context with disabled BH, it's better to use in_task() instead. Link: https://lkml.kernel.org/r/984ee771-4834-21da-801f-c15c18ddf4d1@virtuozzo.com Signed-off-by: Vasily Averin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bd4f249aada3..5e90b3fb7794 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1934,7 +1934,7 @@ unsigned int mempolicy_slab_node(void) struct mempolicy *policy; int node = numa_mem_id(); - if (in_interrupt()) + if (!in_task()) return node; policy = current->mempolicy; From a7259df7670240ee03b0cfce8a3e5d3773911e24 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 2 Sep 2021 15:00:26 -0700 Subject: [PATCH 459/474] memblock: make memblock_find_in_range method private There are a lot of uses of memblock_find_in_range() along with memblock_reserve() from the times memblock allocation APIs did not exist. memblock_find_in_range() is the very core of memblock allocations, so any future changes to its internal behaviour would mandate updates of all the users outside memblock. Replace the calls to memblock_find_in_range() with an equivalent calls to memblock_phys_alloc() and memblock_phys_alloc_range() and make memblock_find_in_range() private method of memblock. This simplifies the callers, ensures that (unlikely) errors in memblock_reserve() are handled and improves maintainability of memblock_find_in_range(). Link: https://lkml.kernel.org/r/20210816122622.30279-1-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Catalin Marinas [arm64] Acked-by: Kirill A. Shutemov Acked-by: Rafael J. Wysocki [ACPI] Acked-by: Russell King (Oracle) Acked-by: Nick Kossifidis [riscv] Tested-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/setup.c | 20 +++++--------- arch/arm64/kvm/hyp/reserved_mem.c | 9 ++---- arch/arm64/mm/init.c | 36 ++++++++---------------- arch/mips/kernel/setup.c | 14 ++++------ arch/riscv/mm/init.c | 46 ++++++++++--------------------- arch/s390/kernel/setup.c | 9 ++++-- arch/x86/kernel/aperture_64.c | 5 ++-- arch/x86/mm/init.c | 23 ++++++++++------ arch/x86/mm/numa.c | 5 ++-- arch/x86/mm/numa_emulation.c | 5 ++-- arch/x86/realmode/init.c | 2 +- drivers/acpi/tables.c | 5 ++-- drivers/base/arch_numa.c | 5 +--- drivers/of/of_reserved_mem.c | 12 +++++--- include/linux/memblock.h | 2 -- mm/memblock.c | 2 +- 16 files changed, 82 insertions(+), 118 deletions(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index f97eb2371672..284a80c0b6e1 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void) unsigned long long lowmem_max = __pa(high_memory - 1) + 1; if (crash_max > lowmem_max) crash_max = lowmem_max; - crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max, - crash_size, CRASH_ALIGN); + + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, crash_max); if (!crash_base) { pr_err("crashkernel reservation failed - No suitable area found.\n"); return; } } else { + unsigned long long crash_max = crash_base + crash_size; unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, SECTION_SIZE); - if (start != crash_base) { + start = memblock_phys_alloc_range(crash_size, SECTION_SIZE, + crash_base, crash_max); + if (!start) { pr_err("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret < 0) { - pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n", - (unsigned long)crash_base); - return; - } - pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n", (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c index d654921dd09b..578670e3f608 100644 --- a/arch/arm64/kvm/hyp/reserved_mem.c +++ b/arch/arm64/kvm/hyp/reserved_mem.c @@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void) * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. */ hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - ALIGN(hyp_mem_size, PMD_SIZE), - PMD_SIZE); + hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE), + PMD_SIZE); if (!hyp_mem_base) - hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), - hyp_mem_size, PAGE_SIZE); + hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE); else hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); @@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void) kvm_err("Failed to reserve hyp memory\n"); return; } - memblock_reserve(hyp_mem_base, hyp_mem_size); kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, hyp_mem_base); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1fdb7bb7c198..bf5b8a5cd451 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init; static void __init reserve_crashkernel(void) { unsigned long long crash_base, crash_size; + unsigned long long crash_max = arm64_dma_phys_limit; int ret; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), @@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, - crash_size, SZ_2M); - if (crash_base == 0) { - pr_warn("cannot allocate crashkernel (size:0x%llx)\n", - crash_size); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region is not memory\n"); - return; - } + /* User specifies base address explicitly. */ + if (crash_base) + crash_max = crash_base + crash_size; - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); - return; - } - - if (!IS_ALIGNED(crash_base, SZ_2M)) { - pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); - return; - } + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_phys_alloc_range(crash_size, SZ_2M, + crash_base, crash_max); + if (!crash_base) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 23a140327a0b..f979adfd4fc2 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void) return; if (crash_base <= 0) { - crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, + CRASH_ALIGN, + CRASH_ADDR_MAX); if (!crash_base) { pr_warn("crashkernel reservation failed - No suitable area found.\n"); return; @@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, crash_base + crash_size, - crash_size, 1); + start = memblock_phys_alloc_range(crash_size, 1, + crash_base, + crash_base + crash_size); if (start != crash_base) { pr_warn("Invalid memory region reserved for crash kernel\n"); return; @@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p) mips_reserve_vmcore(); mips_parse_crashkernel(); -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - memblock_reserve(crashk_res.start, resource_size(&crashk_res)); -#endif device_tree_init(); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 7cb4f391d106..e6cac495a9e8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -819,38 +819,22 @@ static void __init reserve_crashkernel(void) crash_size = PAGE_ALIGN(crash_size); - if (crash_base == 0) { - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 (hugepage size) - */ - crash_base = memblock_find_in_range(search_start, search_end, - crash_size, PMD_SIZE); - - if (crash_base == 0) { - pr_warn("crashkernel: couldn't allocate %lldKB\n", - crash_size >> 10); - return; - } - } else { - /* User specifies base address explicitly. */ - if (!memblock_is_region_memory(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is not memory\n"); - return; - } - - if (memblock_is_region_reserved(crash_base, crash_size)) { - pr_warn("crashkernel: requested region is reserved\n"); - return; - } - - - if (!IS_ALIGNED(crash_base, PMD_SIZE)) { - pr_warn("crashkernel: requested region is misaligned\n"); - return; - } + if (crash_base) { + search_start = crash_base; + search_end = crash_base + crash_size; + } + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; } - memblock_reserve(crash_base, crash_size); pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ff0f9e838916..0bab57d6413b 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -626,8 +626,9 @@ static void __init reserve_crashkernel(void) return; } low = crash_base ?: low; - crash_base = memblock_find_in_range(low, high, crash_size, - KEXEC_CRASH_MEM_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + KEXEC_CRASH_MEM_ALIGN, + low, high); } if (!crash_base) { @@ -636,8 +637,10 @@ static void __init reserve_crashkernel(void) return; } - if (register_memory_notifier(&kdump_mem_nb)) + if (register_memory_notifier(&kdump_mem_nb)) { + memblock_free(crash_base, crash_size); return; + } if (!OLDMEM_BASE && MACHINE_IS_VM) diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0e..10562885f5fc 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 75ef19aa8903..23a14d82e783 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num) unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { - ret = memblock_find_in_range( + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, min_pfn_mapped << PAGE_SHIFT, - max_pfn_mapped << PAGE_SHIFT, - PAGE_SIZE * num , PAGE_SIZE); + max_pfn_mapped << PAGE_SHIFT); } - if (ret) - memblock_reserve(ret, PAGE_SIZE * num); - else if (can_use_brk_pgt) + if (!ret && can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) @@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start, unsigned long addr; unsigned long mapped_ram_size = 0; - /* xen has big range in reserved near end of ram, skip it at first.*/ - addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. + */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e94da744386f..a1b5c71099e6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 87d77cc52f86..737491b13728 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - phys_size, PAGE_SIZE); + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!phys) { pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 6534c92d0f83..31b5856010cb 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -28,7 +28,7 @@ void __init reserve_real_mode(void) WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20); if (!mem) pr_info("No sub-1M memory is available for the trampoline\n"); else diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index a37a1532a575..f9383736fa0f 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -583,8 +583,8 @@ void __init acpi_table_upgrade(void) } acpi_tables_addr = - memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, - all_tables_size, PAGE_SIZE); + memblock_phys_alloc_range(all_tables_size, PAGE_SIZE, + 0, ACPI_TABLE_UPGRADE_MAX_PHYS); if (!acpi_tables_addr) { WARN_ON(1); return; @@ -599,7 +599,6 @@ void __init acpi_table_upgrade(void) * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area) * works fine. */ - memblock_reserve(acpi_tables_addr, all_tables_size); arch_reserve_mem_area(acpi_tables_addr, all_tables_size); /* diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 4cc4e117727d..46c503486e96 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -279,13 +279,10 @@ static int __init numa_alloc_distance(void) int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, PFN_PHYS(max_pfn), - size, PAGE_SIZE); + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); if (WARN_ON(!phys)) return -ENOMEM; - memblock_reserve(phys, size); - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index fd3964d24224..59c1390cdf42 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -33,18 +33,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size, phys_addr_t *res_base) { phys_addr_t base; + int err = 0; end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end; align = !align ? SMP_CACHE_BYTES : align; - base = memblock_find_in_range(start, end, size, align); + base = memblock_phys_alloc_range(size, align, start, end); if (!base) return -ENOMEM; *res_base = base; - if (nomap) - return memblock_mark_nomap(base, size); + if (nomap) { + err = memblock_mark_nomap(base, size); + if (err) + memblock_free(base, size); + } - return memblock_reserve(base, size); + return err; } /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4a53c3ca86bd..b066024c62e3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -99,8 +99,6 @@ void memblock_discard(void); static inline void memblock_discard(void) {} #endif -phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index a69449bffc8d..e6b4654f9dfd 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, +static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { From 884a7e5964e06ed93c7771c0d7cf19c09a8946f1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:29 -0700 Subject: [PATCH 460/474] mm: introduce process_mrelease system call In modern systems it's not unusual to have a system component monitoring memory conditions of the system and tasked with keeping system memory pressure under control. One way to accomplish that is to kill non-essential processes to free up memory for more important ones. Examples of this are Facebook's OOM killer daemon called oomd and Android's low memory killer daemon called lmkd. For such system component it's important to be able to free memory quickly and efficiently. Unfortunately the time process takes to free up its memory after receiving a SIGKILL might vary based on the state of the process (uninterruptible sleep), size and OPP level of the core the process is running. A mechanism to free resources of the target process in a more predictable way would improve system's ability to control its memory pressure. Introduce process_mrelease system call that releases memory of a dying process from the context of the caller. This way the memory is freed in a more controllable way with CPU affinity and priority of the caller. The workload of freeing the memory will also be charged to the caller. The operation is allowed only on a dying process. After previous discussions [1, 2, 3] the decision was made [4] to introduce a dedicated system call to cover this use case. The API is as follows, int process_mrelease(int pidfd, unsigned int flags); DESCRIPTION The process_mrelease() system call is used to free the memory of an exiting process. The pidfd selects the process referred to by the PID file descriptor. (See pidfd_open(2) for further information) The flags argument is reserved for future use; currently, this argument must be specified as 0. RETURN VALUE On success, process_mrelease() returns 0. On error, -1 is returned and errno is set to indicate the error. ERRORS EBADF pidfd is not a valid PID file descriptor. EAGAIN Failed to release part of the address space. EINTR The call was interrupted by a signal; see signal(7). EINVAL flags is not 0. EINVAL The memory of the task cannot be released because the process is not exiting, the address space is shared with another live process or there is a core dump in progress. ENOSYS This system call is not supported, for example, without MMU support built into Linux. ESRCH The target process does not exist (i.e., it has terminated and been waited on). [1] https://lore.kernel.org/lkml/20190411014353.113252-3-surenb@google.com/ [2] https://lore.kernel.org/linux-api/20201113173448.1863419-1-surenb@google.com/ [3] https://lore.kernel.org/linux-api/20201124053943.1684874-3-surenb@google.com/ [4] https://lore.kernel.org/linux-api/20201223075712.GA4719@lst.de/ Link: https://lkml.kernel.org/r/20210809185259.405936-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Christian Brauner Cc: David Rientjes Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: Jann Horn Cc: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c729a4c4a1ac..831340e7ad8b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1141,3 +1142,72 @@ void pagefault_out_of_memory(void) out_of_memory(&oc); mutex_unlock(&oom_lock); } + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + struct task_struct *p; + unsigned int f_flags; + bool reap = true; + struct pid *pid; + long ret = 0; + + if (flags) + return -EINVAL; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* + * Make sure to choose a thread which still has a reference to mm + * during the group exit + */ + p = find_lock_task_mm(task); + if (!p) { + ret = -ESRCH; + goto put_task; + } + + mm = p->mm; + mmgrab(mm); + + /* If the work has been done already, just exit with success */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + reap = false; + else if (!task_will_free_mem(p)) { + reap = false; + ret = -EINVAL; + } + task_unlock(p); + + if (!reap) + goto drop_mm; + + if (mmap_read_lock_killable(mm)) { + ret = -EINTR; + goto drop_mm; + } + if (!__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + +drop_mm: + mmdrop(mm); +put_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} From dce49103962840dd61423d7627748d6c558d58c5 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 2 Sep 2021 15:00:33 -0700 Subject: [PATCH 461/474] mm: wire up syscall process_mrelease Split off from prev patch in the series that implements the syscall. Link: https://lkml.kernel.org/r/20210809185259.405936-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Geert Uytterhoeven Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 2 ++ arch/arm/tools/syscall.tbl | 2 ++ arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 2 ++ arch/m68k/kernel/syscalls/syscall.tbl | 2 ++ arch/microblaze/kernel/syscalls/syscall.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n32.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_n64.tbl | 2 ++ arch/mips/kernel/syscalls/syscall_o32.tbl | 2 ++ arch/parisc/kernel/syscalls/syscall.tbl | 2 ++ arch/powerpc/kernel/syscalls/syscall.tbl | 2 ++ arch/s390/kernel/syscalls/syscall.tbl | 2 ++ arch/sh/kernel/syscalls/syscall.tbl | 2 ++ arch/sparc/kernel/syscalls/syscall.tbl | 2 ++ arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 ++ include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 1 + 21 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index a17687ed4b51..605645eae04c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -486,3 +486,5 @@ 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self +# 557 reserved for memfd_secret +558 common process_mrelease sys_process_mrelease diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c5df1179fc5d..2f32eb8beca8 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -460,3 +460,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 727bfc3be99b..3cb206aea3db 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 447 +#define __NR_compat_syscalls 449 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 99ffcafc736c..0f49cdb180dd 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 6d07742c57b8..9bf45f2be966 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -367,3 +367,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 541bc1b3a8f9..f1f98ee6c82d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -446,3 +446,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index a176faca2927..da49ddd4bb54 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -452,3 +452,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index c2d2e19abea8..56c8d3cf42ed 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -385,3 +385,5 @@ 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n32 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index ac653d08b1ea..1ca7bc337932 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -361,3 +361,5 @@ 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 n64 process_mrelease sys_process_mrelease diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 253f2cd70b6b..fd3a9df60ec2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -434,3 +434,5 @@ 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 o32 process_mrelease sys_process_mrelease diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index e26187b9ab87..040df1b7a589 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -444,3 +444,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index aef2a290e71a..d8ebd7d37c0f 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -526,3 +526,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 64d51ab5a8b4..57233ace30cb 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease sys_process_mrelease diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index e0a70be77d84..2f6e95eb4690 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -449,3 +449,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 603f5a821502..42fc2906215d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -492,3 +492,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ce763a12311c..661a03bcfbd1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -452,3 +452,4 @@ 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret +448 i386 process_mrelease sys_process_mrelease diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f6b57799c1ea..807b6a1de8e8 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -369,6 +369,7 @@ 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret +448 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 235d67d6ceb4..f4384951f393 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -417,3 +417,5 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +# 447 reserved for memfd_secret +448 common process_mrelease sys_process_mrelease diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 69c9a7010081..00bc170a50f0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a9d6fcd95f42..14c8fe863c6d 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) #define __NR_memfd_secret 447 __SYSCALL(__NR_memfd_secret, sys_memfd_secret) #endif +#define __NR_process_mrelease 448 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) #undef __NR_syscalls -#define __NR_syscalls 448 +#define __NR_syscalls 449 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 30971b1dd4a9..18a9c2cde767 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -289,6 +289,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); From c9bd7d183673b5136e56210003e1d94338d47c45 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 2 Sep 2021 15:00:36 -0700 Subject: [PATCH 462/474] mm/migrate: correct kernel-doc notation Use the expected "Return:" format to prevent a kernel-doc warning. mm/migrate.c:1157: warning: Excess function parameter 'returns' description in 'next_demotion_node' Link: https://lkml.kernel.org/r/20210808203151.10632-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index ae923e9b8874..a0aeb3fe46a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1149,7 +1149,7 @@ static int node_demotion[MAX_NUMNODES] __read_mostly = * next_demotion_node() - Get the next node in the demotion path * @node: The starting node to lookup the next node * - * @returns: node id for next memory node in the demotion path hierarchy + * Return: node id for next memory node in the demotion path hierarchy * from @node; NUMA_NO_NODE if @node is terminal. This does not keep * @node online or guarantee that it *continues* to be the next demotion * target. From 68d6289baa35c5e59b5359577d3dc01c00c437d2 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:39 -0700 Subject: [PATCH 463/474] selftests: vm: add KSM merge test Patch series "add KSM selftests". Introduce selftests to validate the functionality of KSM. The tests are run on private anonymous pages. Since some KSM tunables are modified, their starting values are saved and restored after testing. At the start, run is set to 2 to ensure that only test pages will be merged (we assume that no applications make madvise syscalls in the background). If KSM config not enabled, all tests will be skipped. This patch (of 4): Add check_ksm_merge() function to check the basic merging feature of KSM. First, some number of identical pages are allocated and the MADV_MERGEABLE advice is given to merge these pages. Then, pages_shared and pages_sharing values are compared with the expected numbers using assert_ksm_pages_count() function. The number of pages can be changed using -p option. Link: https://lkml.kernel.org/r/cover.1626252248.git.zhansayabagdaulet@gmail.com Link: https://lkml.kernel.org/r/90287685c13300972ea84de93d1f3f900373f9fe.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Shuah Khan Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/ksm_tests.c | 306 ++++++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 16 ++ 4 files changed, 324 insertions(+) create mode 100644 tools/testing/selftests/vm/ksm_tests.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index f0fd80ef17df..b02eac613fdd 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -27,3 +27,4 @@ hmm-tests memfd_secret local_config.* split_huge_page_test +ksm_tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 521243770f26..e6f22a801b71 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -45,6 +45,7 @@ TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd TEST_GEN_FILES += split_huge_page_test +TEST_GEN_FILES += ksm_tests ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c new file mode 100644 index 000000000000..d74d5aa34b16 --- /dev/null +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#include "../kselftest.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf(" -a: specify the access protections of pages.\n" + " must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + + while ((opt = getopt(argc, argv, "ha:p:l:")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, + page_size); + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index d09a6b71f1e9..97b6f712134d 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -377,6 +377,22 @@ else exitcode=1 fi +echo "-------------------------------------------------------" +echo "running KSM MADV_MERGEABLE test with 10 identical pages" +echo "-------------------------------------------------------" +./ksm_tests -p 10 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From a40c80e348fac4ecff8abcc3fae31e2e84c055d6 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:42 -0700 Subject: [PATCH 464/474] selftests: vm: add KSM unmerge test Add check_ksm_unmerge() function to verify that KSM is properly unmerging shared pages. For this, two duplicate pages are merged first and then their contents are modified. Since they are not identical anymore, the pages must be unmerged and the number of merged pages has to be 0. The test is run as follows: ./ksm_tests -U Link: https://lkml.kernel.org/r/c0f55420440d704d5b094275b4365aa1b2ad46b5.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 72 +++++++++++++++++++++-- tools/testing/selftests/vm/run_vmtests.sh | 18 +++++- 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index d74d5aa34b16..80302bb8f64c 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -23,6 +23,11 @@ struct ksm_sysfs { unsigned long use_zero_pages; }; +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE +}; + static int ksm_write_sysfs(const char *file_path, unsigned long val) { FILE *f = fopen(file_path, "w"); @@ -75,7 +80,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { - printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + + printf("Supported :\n" + " -M (page merging)\n" + " -U (page unmerging)\n\n"); + printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" " Default: %s\n", KSM_PROT_STR_DEFAULT); @@ -239,6 +249,46 @@ err_out: return KSFT_FAIL; } +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -247,8 +297,9 @@ int main(int argc, char *argv[]) long page_count = KSM_PAGE_COUNT_DEFAULT; size_t page_size = sysconf(_SC_PAGESIZE); struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; - while ((opt = getopt(argc, argv, "ha:p:l:")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:MU")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -270,6 +321,11 @@ int main(int argc, char *argv[]) case 'h': print_help(); break; + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; default: return KSFT_FAIL; } @@ -294,8 +350,16 @@ int main(int argc, char *argv[]) ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) return KSFT_FAIL; - ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, - page_size); + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } if (ksm_restore(&ksm_sysfs_old)) { printf("Cannot restore default tunables\n"); diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 97b6f712134d..3a23c6b47da2 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -380,7 +380,23 @@ fi echo "-------------------------------------------------------" echo "running KSM MADV_MERGEABLE test with 10 identical pages" echo "-------------------------------------------------------" -./ksm_tests -p 10 +./ksm_tests -M -p 10 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "------------------------" +echo "running KSM unmerge test" +echo "------------------------" +./ksm_tests -U ret_val=$? if [ $ret_val -eq 0 ]; then From 39619982c5be6ed57390f17aabee6bc11e4af37e Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:45 -0700 Subject: [PATCH 465/474] selftests: vm: add KSM zero page merging test Add check_ksm_zero_page_merge() function to test that empty pages are being handled properly. For this, several zero pages are allocated and merged using madvise. If use_zero_pages is enabled, the pages must be shared with the special kernel zero pages; otherwise, they are merged as usual duplicate pages. The test is run as follows: ./ksm_tests -Z Link: https://lkml.kernel.org/r/6d0caab00d4bdccf5e3791cb95cf6dfd5eb85e45.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 70 ++++++++++++++++++++++- tools/testing/selftests/vm/run_vmtests.sh | 32 +++++++++++ 2 files changed, 99 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 80302bb8f64c..5843526471e1 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -12,6 +12,7 @@ #define KSM_SCAN_LIMIT_SEC_DEFAULT 120 #define KSM_PAGE_COUNT_DEFAULT 10l #define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false struct ksm_sysfs { unsigned long max_page_sharing; @@ -25,7 +26,8 @@ struct ksm_sysfs { enum ksm_test_name { CHECK_KSM_MERGE, - CHECK_KSM_UNMERGE + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -80,10 +82,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { - printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n"); + printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages]\n"); printf("Supported :\n" " -M (page merging)\n" + " -Z (zero pages merging)\n" " -U (page unmerging)\n\n"); printf(" -a: specify the access protections of pages.\n" @@ -93,6 +97,8 @@ static void print_help(void) " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); printf(" -l: limit the maximum running time (in seconds) for a test.\n" " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); exit(0); } @@ -289,6 +295,50 @@ err_out: return KSFT_FAIL; } +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. + */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -298,8 +348,9 @@ int main(int argc, char *argv[]) size_t page_size = sysconf(_SC_PAGESIZE); struct ksm_sysfs ksm_sysfs_old; int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; - while ((opt = getopt(argc, argv, "ha:p:l:MU")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:MUZ")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -321,11 +372,20 @@ int main(int argc, char *argv[]) case 'h': print_help(); break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; case 'M': break; case 'U': test_name = CHECK_KSM_UNMERGE; break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; default: return KSFT_FAIL; } @@ -359,6 +419,10 @@ int main(int argc, char *argv[]) ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 3a23c6b47da2..9b4e444fc4ed 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -409,6 +409,38 @@ else exitcode=1 fi +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 0" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "----------------------------------------------------------" +echo "running KSM test with 10 zero pages and use_zero_pages = 1" +echo "----------------------------------------------------------" +./ksm_tests -Z -p 10 -z 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From 82e717ad35018ce59ba1b66297dfd898b2a1a03f Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:48 -0700 Subject: [PATCH 466/474] selftests: vm: add KSM merging across nodes test Add check_ksm_numa_merge() function to test that pages in different NUMA nodes are being handled properly. First, two duplicate pages are allocated in two separate NUMA nodes using the libnuma library. Since there is one unique page in each node, with merge_across_nodes = 0, there won't be any shared pages. If merge_across_nodes is set to 1, the pages will be treated as usual duplicate pages and will be merged. If NUMA config is not enabled or the number of NUMA nodes is less than two, then the test is skipped. The test is run as follows: ./ksm_tests -N Link: https://lkml.kernel.org/r/071c17b5b04ebb0dfeba137acc495e5dd9d2a719.1626252248.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Pavel Tatashin Reviewed-by: Tyler Hicks Cc: Hugh Dickins Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 2 + tools/testing/selftests/vm/ksm_tests.c | 88 ++++++++++++++++++++++- tools/testing/selftests/vm/run_vmtests.sh | 32 +++++++++ 3 files changed, 119 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index e6f22a801b71..d9605bd10f2d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -146,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h # HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS) +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + local_config.mk local_config.h: check_config.sh /bin/sh ./check_config.sh $(CC) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 5843526471e1..cdeb4a028538 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "../kselftest.h" @@ -13,6 +14,7 @@ #define KSM_PAGE_COUNT_DEFAULT 10l #define KSM_PROT_STR_DEFAULT "rw" #define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true struct ksm_sysfs { unsigned long max_page_sharing; @@ -27,7 +29,8 @@ struct ksm_sysfs { enum ksm_test_name { CHECK_KSM_MERGE, CHECK_KSM_UNMERGE, - CHECK_KSM_ZERO_PAGE_MERGE + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -83,11 +86,12 @@ static int str_to_prot(char *prot_str) static void print_help(void) { printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages]\n"); + "[-z use_zero_pages] [-m merge_across_nodes]\n"); printf("Supported :\n" " -M (page merging)\n" " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" " -U (page unmerging)\n\n"); printf(" -a: specify the access protections of pages.\n" @@ -99,6 +103,8 @@ static void print_help(void) " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); printf(" -z: change use_zero_pages tunable\n" " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); exit(0); } @@ -339,6 +345,68 @@ err_out: return KSFT_FAIL; } +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_max_node() < 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + numa1_map_ptr = numa_alloc_onnode(page_size, 0); + numa2_map_ptr = numa_alloc_onnode(page_size, 1); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. + */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -349,8 +417,9 @@ int main(int argc, char *argv[]) struct ksm_sysfs ksm_sysfs_old; int test_name = CHECK_KSM_MERGE; bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; - while ((opt = getopt(argc, argv, "ha:p:l:z:MUZ")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:MUZN")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -378,6 +447,12 @@ int main(int argc, char *argv[]) else use_zero_pages = 1; break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; case 'M': break; case 'U': @@ -386,6 +461,9 @@ int main(int argc, char *argv[]) case 'Z': test_name = CHECK_KSM_ZERO_PAGE_MERGE; break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; default: return KSFT_FAIL; } @@ -423,6 +501,10 @@ int main(int argc, char *argv[]) ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, ksm_scan_limit_sec, use_zero_pages, page_size); break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 9b4e444fc4ed..45e803af7c77 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -441,6 +441,38 @@ else exitcode=1 fi +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 1 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +echo "-------------------------------------------------------------" +echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0" +echo "-------------------------------------------------------------" +./ksm_tests -N -m 0 +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + exit $exitcode exit $exitcode From 584ff0dfb09a81e1addf02543f7c1fa8e71ce6d2 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:51 -0700 Subject: [PATCH 467/474] mm: KSM: fix data type ksm_stable_node_chains_prune_millisecs is declared as int, but in stable__node_chains_prune_millisecs_store(), it can store values up to UINT_MAX. Change its type to unsigned int. Link: https://lkml.kernel.org/r/20210806111351.GA71845@asus Signed-off-by: Zhansaya Bagdauletkyzy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 3fa9bc8a67cf..025338128cd9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -259,7 +259,7 @@ static unsigned long ksm_stable_node_chains; static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ -static int ksm_stable_node_chains_prune_millisecs = 2000; +static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; @@ -3105,11 +3105,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; From 9e7cb94ca218816bfd51e07b50bcfeadd3166d42 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:54 -0700 Subject: [PATCH 468/474] selftests: vm: add KSM merging time test Patch series "add KSM performance tests", v3. Extend KSM self tests with a performance benchmark. These tests are not part of regular regression testing, as they are mainly intended to be used by developers making changes to the memory management subsystem. This patch (of 2): Add ksm_merge_time() function to determine speed and time needed for merging. The total spent time is shown in seconds while speed is in MiB/s. User must specify the size of duplicated memory area (in MiB) before running the test. The test is run as follows: ./ksm_tests -P -s 100 The output: Total size: 100 MiB Total time: 0.201106786 s Average speed: 497.248 MiB/s Link: https://lkml.kernel.org/r/cover.1629386192.git.zhansayabagdaulet@gmail.com Link: https://lkml.kernel.org/r/318b946ac80cc9205c89d0962048378f7ce0705b.1629386192.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Tyler Hicks Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 74 ++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index cdeb4a028538..432dfe615e50 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -7,6 +7,7 @@ #include #include "../kselftest.h" +#include "../../../../include/vdso/time64.h" #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" #define KSM_FP(s) (KSM_SYSFS_PATH s) @@ -15,6 +16,7 @@ #define KSM_PROT_STR_DEFAULT "rw" #define KSM_USE_ZERO_PAGES_DEFAULT false #define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) struct ksm_sysfs { unsigned long max_page_sharing; @@ -30,7 +32,8 @@ enum ksm_test_name { CHECK_KSM_MERGE, CHECK_KSM_UNMERGE, CHECK_KSM_ZERO_PAGE_MERGE, - CHECK_KSM_NUMA_MERGE + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -86,13 +89,16 @@ static int str_to_prot(char *prot_str) static void print_help(void) { printf("usage: ksm_tests [-h] [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages] [-m merge_across_nodes]\n"); + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); printf("Supported :\n" " -M (page merging)\n" " -Z (zero pages merging)\n" " -N (merging of pages in different NUMA nodes)\n" - " -U (page unmerging)\n\n"); + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n\n"); printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" @@ -105,6 +111,7 @@ static void print_help(void) " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); printf(" -m: change merge_across_nodes tunable\n" " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); exit(0); } @@ -407,6 +414,47 @@ err_out: return KSFT_FAIL; } +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -418,8 +466,9 @@ int main(int argc, char *argv[]) int test_name = CHECK_KSM_MERGE; bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:MUZN")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNP")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -453,6 +502,12 @@ int main(int argc, char *argv[]) else merge_across_nodes = 1; break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } case 'M': break; case 'U': @@ -464,6 +519,9 @@ int main(int argc, char *argv[]) case 'N': test_name = CHECK_KSM_NUMA_MERGE; break; + case 'P': + test_name = KSM_MERGE_TIME; + break; default: return KSFT_FAIL; } @@ -505,6 +563,14 @@ int main(int argc, char *argv[]) ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, merge_across_nodes, page_size); break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; } if (ksm_restore(&ksm_sysfs_old)) { From 924a11bd1623787c6604590202e0920eb572fa42 Mon Sep 17 00:00:00 2001 From: Zhansaya Bagdauletkyzy Date: Thu, 2 Sep 2021 15:00:57 -0700 Subject: [PATCH 469/474] selftests: vm: add COW time test for KSM pages Since merged pages are copied every time they need to be modified, the write access time is different between shared and non-shared pages. Add ksm_cow_time() function which evaluates latency of these COW breaks. First, 4000 pages are allocated and the time, required to modify 1 byte in every other page, is measured. After this, the pages are merged into 2000 pairs and in each pair, 1 page is modified (i.e. they are decoupled) to detect COW breaks. The time needed to break COW of merged pages is then compared with performance of non-shared pages. The test is run as follows: ./ksm_tests -C The output: Total size: 15 MiB Not merged pages: Total time: 0.002185489 s Average speed: 3202.945 MiB/s Merged pages: Total time: 0.004386872 s Average speed: 1595.670 MiB/s Link: https://lkml.kernel.org/r/1d03ee0d1b341959d4b61672c6401d498bff5652.1629386192.git.zhansayabagdaulet@gmail.com Signed-off-by: Zhansaya Bagdauletkyzy Reviewed-by: Tyler Hicks Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/ksm_tests.c | 86 +++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 432dfe615e50..b61dcdb44c5b 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -33,7 +33,8 @@ enum ksm_test_name { CHECK_KSM_UNMERGE, CHECK_KSM_ZERO_PAGE_MERGE, CHECK_KSM_NUMA_MERGE, - KSM_MERGE_TIME + KSM_MERGE_TIME, + KSM_COW_TIME }; static int ksm_write_sysfs(const char *file_path, unsigned long val) @@ -98,7 +99,8 @@ static void print_help(void) " -U (page unmerging)\n" " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n\n"); + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); printf(" -a: specify the access protections of pages.\n" " must be of the form [rwx].\n" @@ -455,6 +457,77 @@ err_out: return KSFT_FAIL; } +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + int main(int argc, char *argv[]) { int ret, opt; @@ -468,7 +541,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNP")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -522,6 +595,9 @@ int main(int argc, char *argv[]) case 'P': test_name = KSM_MERGE_TIME; break; + case 'C': + test_name = KSM_COW_TIME; + break; default: return KSFT_FAIL; } @@ -571,6 +647,10 @@ int main(int argc, char *argv[]) ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; } if (ksm_restore(&ksm_sysfs_old)) { From 319814504992f51ed17af60edb1a237ada1892e8 Mon Sep 17 00:00:00 2001 From: Jing Xiangfeng Date: Thu, 2 Sep 2021 15:01:00 -0700 Subject: [PATCH 470/474] mm/percpu,c: remove obsolete comments of pcpu_chunk_populated() Commit b239f7daf553 ("percpu: set PCPU_BITMAP_BLOCK_SIZE to PAGE_SIZE") removed the parameter 'for_alloc', so remove this comment. Link: https://lkml.kernel.org/r/1630576043-21367-1-git-send-email-jingxiangfeng@huawei.com Signed-off-by: Jing Xiangfeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 7f2e0151c4e2..e1c20837a42a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1520,9 +1520,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. - * - * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it - * is to serve an allocation in that area. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) From ea15ba17b434b7dd7f92bb85b7e5cf53707733ad Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 15:01:03 -0700 Subject: [PATCH 471/474] mm/vmstat: correct some wrong comments Patch series "Cleanup for vmstat". This series contains cleanups to remove unneeded return value, correct wrong comment and simplify the array size calculation. More details can be found in the respective changelogs. This patch (of 3): Correct wrong fls(mem+1) to fls(mem)+1 and remove the duplicated comment with quiet_vmstat(). Link: https://lkml.kernel.org/r/20210715122911.15700-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210715122911.15700-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index ec5a2e789dd2..5bd621eadc48 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone) * * Some sample thresholds: * - * Threshold Processors (fls) Zonesize fls(mem+1) + * Threshold Processors (fls) Zonesize fls(mem)+1 * ------------------------------------------------------------------ * 8 1 1 0.9-1 GB 4 * 16 2 2 0.9-1 GB 4 @@ -1875,11 +1875,6 @@ static void vmstat_update(struct work_struct *w) } } -/* - * Switch off vmstat processing and then fold all the remaining differentials - * until the diffs stay at zero. The function is used by NOHZ and can only be - * invoked when tick processing is not active. - */ /* * Check if the diffs for a certain cpu indicate that * an update is needed. From 64632fd3eb4611780a1190362f2119d8f2bb5465 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 15:01:05 -0700 Subject: [PATCH 472/474] mm/vmstat: simplify the array size calculation We can replace the array_num * sizeof(array[0]) with sizeof(array) to simplify the code. Link: https://lkml.kernel.org/r/20210715122911.15700-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 5bd621eadc48..c4634dc83916 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1891,17 +1891,15 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. */ - if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(pzstats->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff))) return true; if (last_pgdat == zone->zone_pgdat) continue; last_pgdat = zone->zone_pgdat; n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); - if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS * - sizeof(n->vm_node_stat_diff[0]))) - return true; + if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff))) + return true; } return false; } From 33090af97350cee9ea0e347301cef704bd5b0d8e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 2 Sep 2021 15:01:08 -0700 Subject: [PATCH 473/474] mm/vmstat: remove unneeded return value The return value of pagetypeinfo_showfree and pagetypeinfo_showblockcount are unused now. Remove them. Link: https://lkml.kernel.org/r/20210715122911.15700-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index c4634dc83916..13ff25d0d96a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1454,7 +1454,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, } /* Print out the free pages at each order for each migatetype */ -static int pagetypeinfo_showfree(struct seq_file *m, void *arg) +static void pagetypeinfo_showfree(struct seq_file *m, void *arg) { int order; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1466,8 +1466,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); - - return 0; } static void pagetypeinfo_showblockcount_print(struct seq_file *m, @@ -1503,7 +1501,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, } /* Print out the number of pageblocks for each migratetype */ -static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; pg_data_t *pgdat = (pg_data_t *)arg; @@ -1514,8 +1512,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) seq_putc(m, '\n'); walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showblockcount_print); - - return 0; } /* From d5fffc5aff269717a035baa087630adca612a6c4 Mon Sep 17 00:00:00 2001 From: zhangkui Date: Thu, 2 Sep 2021 15:01:11 -0700 Subject: [PATCH 474/474] mm/madvise: add MADV_WILLNEED to process_madvise() There is a usecase in Android that an app process's memory is swapped out by process_madvise() with MADV_PAGEOUT, such as the memory is swapped to zram or a backing device. When the process is scheduled to running, like switch to foreground, multiple page faults may cause the app dropped frames. To reduce the problem, System Management Software can read-ahead memory of the process immediately when the app switches to forground. Calling process_madvise() with MADV_WILLNEED can meet this need. Link: https://lkml.kernel.org/r/20210804082010.12482-1-zhangkui@oppo.com Signed-off-by: zhangkui Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 5c065bc8b5f6..4a15a83031f6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1048,6 +1048,7 @@ process_madvise_behavior_valid(int behavior) switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: + case MADV_WILLNEED: return true; default: return false;