From 9897713fe1077c90b4a86c9af0a878d56c8888a2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 Sep 2024 19:11:50 +0200 Subject: [PATCH 01/12] bcachefs: do not use PF_MEMALLOC_NORECLAIM Patch series "remove PF_MEMALLOC_NORECLAIM" v3. This patch (of 2): bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new inode to achieve GFP_NOWAIT semantic while holding locks. If this allocation fails it will drop locks and use GFP_NOFS allocation context. We would like to drop PF_MEMALLOC_NORECLAIM because it is really dangerous to use if the caller doesn't control the full call chain with this flag set. E.g. if any of the function down the chain needed GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and cause unexpected failure. While this is not the case in this particular case using the scoped gfp semantic is not really needed bacause we can easily pus the allocation context down the chain without too much clutter. [akpm@linux-foundation.org: fix kerneldoc warnings] Link: https://lkml.kernel.org/r/20240926172940.167084-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20240926172940.167084-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Jan Kara # For vfs changes Cc: Al Viro Cc: Christian Brauner Cc: James Morris Cc: Kent Overstreet Cc: Paul Moore Cc: Serge E. Hallyn Cc: Yafang Shao Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/bcachefs/fs.c | 14 ++++++-------- fs/inode.c | 10 ++++++---- include/linux/fs.h | 7 ++++++- include/linux/security.h | 4 ++-- security/security.c | 10 ++++++---- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5bfc26d58270..02969dff165d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -300,10 +300,10 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) BUG(); } -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp) { struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, - bch2_inode_cache, GFP_NOFS); + bch2_inode_cache, gfp); if (!inode) return NULL; @@ -315,7 +315,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) mutex_init(&inode->ei_quota_lock); memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); - if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) { + if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { kmem_cache_free(bch2_inode_cache, inode); return NULL; } @@ -328,12 +328,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { - struct bch_inode_info *inode = - memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, - __bch2_new_inode(trans->c)); + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); if (unlikely(!inode)) { - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM); + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); @@ -407,7 +405,7 @@ __bch2_create(struct mnt_idmap *idmap, if (ret) return ERR_PTR(ret); #endif - inode = __bch2_new_inode(c); + inode = __bch2_new_inode(c, GFP_NOFS); if (unlikely(!inode)) { inode = ERR_PTR(-ENOMEM); goto err; diff --git a/fs/inode.c b/fs/inode.c index 471ae4a31549..8dabb224f941 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -146,14 +146,16 @@ static int no_open(struct inode *inode, struct file *file) } /** - * inode_init_always - perform inode structure initialisation + * inode_init_always_gfp - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise + * @gfp: allocation flags * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. + * If there are additional allocations required @gfp is used. */ -int inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp) { static const struct inode_operations empty_iops; static const struct file_operations no_open_fops = {.open = no_open}; @@ -230,14 +232,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #endif inode->i_flctx = NULL; - if (unlikely(security_inode_alloc(inode))) + if (unlikely(security_inode_alloc(inode, gfp))) return -ENOMEM; this_cpu_inc(nr_inodes); return 0; } -EXPORT_SYMBOL(inode_init_always); +EXPORT_SYMBOL(inode_init_always_gfp); void free_inode_nonrcu(struct inode *inode) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..3559446279c1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3082,7 +3082,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); -extern int inode_init_always(struct super_block *, struct inode *); +extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); +static inline int inode_init_always(struct super_block *sb, struct inode *inode) +{ + return inode_init_always_gfp(sb, inode, GFP_NOFS); +} + extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); diff --git a/include/linux/security.h b/include/linux/security.h index b86ec2afc691..2ec8f3014757 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -348,7 +348,7 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct cred *new); int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type); -int security_inode_alloc(struct inode *inode); +int security_inode_alloc(struct inode *inode, gfp_t gfp); void security_inode_free(struct inode *inode); int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, @@ -789,7 +789,7 @@ static inline int security_path_notify(const struct path *path, u64 mask, return 0; } -static inline int security_inode_alloc(struct inode *inode) +static inline int security_inode_alloc(struct inode *inode, gfp_t gfp) { return 0; } diff --git a/security/security.c b/security/security.c index 6875eb4a59fc..c5981e558bc2 100644 --- a/security/security.c +++ b/security/security.c @@ -740,19 +740,20 @@ static int lsm_file_alloc(struct file *file) /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob + * @gfp: allocation flags * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ -static int lsm_inode_alloc(struct inode *inode) +static int lsm_inode_alloc(struct inode *inode, gfp_t gfp) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } - inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS); + inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp); if (inode->i_security == NULL) return -ENOMEM; return 0; @@ -1678,6 +1679,7 @@ int security_path_notify(const struct path *path, u64 mask, /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode + * @gfp: allocation flags * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is @@ -1685,9 +1687,9 @@ int security_path_notify(const struct path *path, u64 mask, * * Return: Return 0 if operation was successful. */ -int security_inode_alloc(struct inode *inode) +int security_inode_alloc(struct inode *inode, gfp_t gfp) { - int rc = lsm_inode_alloc(inode); + int rc = lsm_inode_alloc(inode, gfp); if (unlikely(rc)) return rc; From 9a8da05d7ad619beb84d0c6904c3fa7022c6fb9b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 Sep 2024 19:11:51 +0200 Subject: [PATCH 02/12] Revert "mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN" This reverts commit eab0af905bfc3e9c05da2ca163d76a1513159aa4. There is no existing user of those flags. PF_MEMALLOC_NOWARN is dangerous because a nested allocation context can use GFP_NOFAIL which could cause unexpected failure. Such a code would be hard to maintain because it could be deeper in the call chain. PF_MEMALLOC_NORECLAIM has been added even when it was pointed out [1] that such a allocation contex is inherently unsafe if the context doesn't fully control all allocations called from this context. While PF_MEMALLOC_NOWARN is not dangerous the way PF_MEMALLOC_NORECLAIM is it doesn't have any user and as Matthew has pointed out we are running out of those flags so better reclaim it without any real users. [1] https://lore.kernel.org/all/ZcM0xtlKbAOFjv5n@tiehlicka/ Link: https://lkml.kernel.org/r/20240926172940.167084-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Vlastimil Babka Cc: Al Viro Cc: Christian Brauner Cc: James Morris Cc: Jan Kara Cc: Kent Overstreet Cc: Paul Moore Cc: Serge E. Hallyn Cc: Yafang Shao Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 ++-- include/linux/sched/mm.h | 17 ++++------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..449dd64ed9ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1681,8 +1681,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ -#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ +#define PF__HOLE__00800000 0x00800000 +#define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 07bb8d4181d7..928a626725e6 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -251,25 +251,16 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | - PF_MEMALLOC_NOFS | - PF_MEMALLOC_NORECLAIM | - PF_MEMALLOC_NOWARN | - PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* - * Stronger flags before weaker flags: - * NORECLAIM implies NOIO, which in turn implies NOFS + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precedence */ - if (pflags & PF_MEMALLOC_NORECLAIM) - flags &= ~__GFP_DIRECT_RECLAIM; - else if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; - if (pflags & PF_MEMALLOC_NOWARN) - flags |= __GFP_NOWARN; - if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } From 214e01ad4ed7158cab66498810094fac5d09b218 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Sep 2024 23:46:34 +0200 Subject: [PATCH 03/12] kthread: unpark only parked kthread Calling into kthread unparking unconditionally is mostly harmless when the kthread is already unparked. The wake up is then simply ignored because the target is not in TASK_PARKED state. However if the kthread is per CPU, the wake up is preceded by a call to kthread_bind() which expects the task to be inactive and in TASK_PARKED state, which obviously isn't the case if it is unparked. As a result, calling kthread_stop() on an unparked per-cpu kthread triggers such a warning: WARNING: CPU: 0 PID: 11 at kernel/kthread.c:525 __kthread_bind_mask kernel/kthread.c:525 kthread_stop+0x17a/0x630 kernel/kthread.c:707 destroy_workqueue+0x136/0xc40 kernel/workqueue.c:5810 wg_destruct+0x1e2/0x2e0 drivers/net/wireguard/device.c:257 netdev_run_todo+0xe1a/0x1000 net/core/dev.c:10693 default_device_exit_batch+0xa14/0xa90 net/core/dev.c:11769 ops_exit_list net/core/net_namespace.c:178 [inline] cleanup_net+0x89d/0xcc0 net/core/net_namespace.c:640 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312 worker_thread+0x86d/0xd70 kernel/workqueue.c:3393 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fix this with skipping unecessary unparking while stopping a kthread. Link: https://lkml.kernel.org/r/20240913214634.12557-1-frederic@kernel.org Fixes: 5c25b5ff89f0 ("workqueue: Tag bound workers with KTHREAD_IS_PER_CPU") Signed-off-by: Frederic Weisbecker Reported-by: syzbot+943d34fa3cf2191e3068@syzkaller.appspotmail.com Tested-by: syzbot+943d34fa3cf2191e3068@syzkaller.appspotmail.com Suggested-by: Thomas Gleixner Cc: Hillf Danton Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- kernel/kthread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kthread.c b/kernel/kthread.c index db4ceb0f503c..9bb36897b6c6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -623,6 +623,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); + if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) + return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. From 7fcbd9785d4c17ea533c42f20a9083a83f301fa6 Mon Sep 17 00:00:00 2001 From: "Kun(llfl)" Date: Fri, 27 Sep 2024 15:45:09 +0800 Subject: [PATCH 04/12] device-dax: correct pgoff align in dax_set_mapping() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pgoff should be aligned using ALIGN_DOWN() instead of ALIGN(). Otherwise, vmf->address not aligned to fault_size will be aligned to the next alignment, that can result in memory failure getting the wrong address. It's a subtle situation that only can be observed in page_mapped_in_vma() after the page is page fault handled by dev_dax_huge_fault. Generally, there is little chance to perform page_mapped_in_vma in dev-dax's page unless in specific error injection to the dax device to trigger an MCE - memory-failure. In that case, page_mapped_in_vma() will be triggered to determine which task is accessing the failure address and kill that task in the end. We used self-developed dax device (which is 2M aligned mapping) , to perform error injection to random address. It turned out that error injected to non-2M-aligned address was causing endless MCE until panic. Because page_mapped_in_vma() kept resulting wrong address and the task accessing the failure address was never killed properly: [ 3783.719419] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3784.049006] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3784.049190] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3784.448042] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3784.448186] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3784.792026] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3784.792179] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3785.162502] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3785.162633] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3785.461116] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3785.461247] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3785.764730] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3785.764859] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3786.042128] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3786.042259] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3786.464293] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3786.464423] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3786.818090] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3786.818217] Memory failure: 0x200c9742: recovery action for dax page: Recovered [ 3787.085297] mce: Uncorrected hardware memory error in user-access at 200c9742380 [ 3787.085424] Memory failure: 0x200c9742: recovery action for dax page: Recovered It took us several weeks to pinpoint this problem,  but we eventually used bpftrace to trace the page fault and mce address and successfully identified the issue. Joao added: ; Likely we never reproduce in production because we always pin : device-dax regions in the region align they provide (Qemu does : similarly with prealloc in hugetlb/file backed memory). I think this : bug requires that we touch *unpinned* device-dax regions unaligned to : the device-dax selected alignment (page size i.e. 4K/2M/1G) Link: https://lkml.kernel.org/r/23c02a03e8d666fef11bbe13e85c69c8b4ca0624.1727421694.git.llfl@linux.alibaba.com Fixes: b9b5777f09be ("device-dax: use ALIGN() for determining pgoff") Signed-off-by: Kun(llfl) Tested-by: JianXiong Zhao Reviewed-by: Joao Martins Cc: Dan Williams Cc: Signed-off-by: Andrew Morton --- drivers/dax/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 9c1a729cd77e..6d74e62bbee0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -86,7 +86,7 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn, nr_pages = 1; pgoff = linear_page_index(vmf->vma, - ALIGN(vmf->address, fault_size)); + ALIGN_DOWN(vmf->address, fault_size)); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i); From 76503e1fa1a53ef041a120825d5ce81c7fe7bdd7 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Fri, 27 Sep 2024 00:07:52 -0500 Subject: [PATCH 05/12] selftests/mm: fix incorrect buffer->mirror size in hmm2 double_map test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hmm2 double_map test was failing due to an incorrect buffer->mirror size. The buffer->mirror size was 6, while buffer->ptr size was 6 * PAGE_SIZE. The test failed because the kernel's copy_to_user function was attempting to copy a 6 * PAGE_SIZE buffer to buffer->mirror. Since the size of buffer->mirror was incorrect, copy_to_user failed. This patch corrects the buffer->mirror size to 6 * PAGE_SIZE. Test Result without this patch ============================== # RUN hmm2.hmm2_device_private.double_map ... # hmm-tests.c:1680:double_map:Expected ret (-14) == 0 (0) # double_map: Test terminated by assertion # FAIL hmm2.hmm2_device_private.double_map not ok 53 hmm2.hmm2_device_private.double_map Test Result with this patch =========================== # RUN hmm2.hmm2_device_private.double_map ... # OK hmm2.hmm2_device_private.double_map ok 53 hmm2.hmm2_device_private.double_map Link: https://lkml.kernel.org/r/20240927050752.51066-1-donettom@linux.ibm.com Fixes: fee9f6d1b8df ("mm/hmm/test: add selftests for HMM") Signed-off-by: Donet Tom Reviewed-by: Muhammad Usama Anjum Cc: Jérôme Glisse Cc: Kees Cook Cc: Mark Brown Cc: Przemek Kitszel Cc: Ritesh Harjani (IBM) Cc: Shuah Khan Cc: Ralph Campbell Cc: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hmm-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index d2cfc9b494a0..141bf63cbe05 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -1657,7 +1657,7 @@ TEST_F(hmm2, double_map) buffer->fd = -1; buffer->size = size; - buffer->mirror = malloc(npages); + buffer->mirror = malloc(size); ASSERT_NE(buffer->mirror, NULL); /* Reserve a range of addresses. */ From 3d5854d75e3187147613130561b58f0b06166172 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 30 Sep 2024 14:21:19 +0200 Subject: [PATCH 06/12] fs/proc/kcore.c: allow translation of physical memory addresses When /proc/kcore is read an attempt to read the first two pages results in HW-specific page swap on s390 and another (so called prefix) pages are accessed instead. That leads to a wrong read. Allow architecture-specific translation of memory addresses using kc_xlate_dev_mem_ptr() and kc_unxlate_dev_mem_ptr() callbacks similarily to /dev/mem xlate_dev_mem_ptr() and unxlate_dev_mem_ptr() callbacks. That way an architecture can deal with specific physical memory ranges. Re-use the existing /dev/mem callback implementation on s390, which handles the described prefix pages swapping correctly. For other architectures the default callback is basically NOP. It is expected the condition (vaddr == __va(__pa(vaddr))) always holds true for KCORE_RAM memory type. Link: https://lkml.kernel.org/r/20240930122119.1651546-1-agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Heiko Carstens Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- arch/s390/include/asm/io.h | 2 ++ fs/proc/kcore.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 0fbc992d7a5e..fc9933a743d6 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -16,8 +16,10 @@ #include #define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr xlate_dev_mem_ptr void *xlate_dev_mem_ptr(phys_addr_t phys); #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr unxlate_dev_mem_ptr void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7d0acdad74e2..51446c59388f 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -50,6 +50,20 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif +#ifndef kc_xlate_dev_mem_ptr +#define kc_xlate_dev_mem_ptr kc_xlate_dev_mem_ptr +static inline void *kc_xlate_dev_mem_ptr(phys_addr_t phys) +{ + return __va(phys); +} +#endif +#ifndef kc_unxlate_dev_mem_ptr +#define kc_unxlate_dev_mem_ptr kc_unxlate_dev_mem_ptr +static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt) +{ +} +#endif + static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; @@ -471,6 +485,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) while (buflen) { struct page *page; unsigned long pfn; + phys_addr_t phys; + void *__start; /* * If this is the first iteration or the address is not within @@ -537,7 +553,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) } break; case KCORE_RAM: - pfn = __pa(start) >> PAGE_SHIFT; + phys = __pa(start); + pfn = phys >> PAGE_SHIFT; page = pfn_to_online_page(pfn); /* @@ -557,13 +574,28 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: + if (m->type == KCORE_RAM) { + __start = kc_xlate_dev_mem_ptr(phys); + if (!__start) { + ret = -ENOMEM; + if (iov_iter_zero(tsz, iter) != tsz) + ret = -EFAULT; + goto out; + } + } else { + __start = (void *)start; + } + /* * Sadly we must use a bounce buffer here to be able to * make use of copy_from_kernel_nofault(), as these * memory regions might not always be mapped on all * architectures. */ - if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + ret = copy_from_kernel_nofault(buf, __start, tsz); + if (m->type == KCORE_RAM) + kc_unxlate_dev_mem_ptr(phys, __start); + if (ret) { if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; From 0665d7a39bdf92c8ac3dc390501f303907c87f62 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 30 Sep 2024 15:06:11 +0800 Subject: [PATCH 07/12] resource, kunit: fix user-after-free in resource_test_region_intersects() In resource_test_insert_resource(), the pointer is used in error message after kfree(). This is user-after-free. To fix this, we need to call kunit_add_action_or_reset() to schedule memory freeing after usage. But kunit_add_action_or_reset() itself may fail and free the memory. So, its return value should be checked and abort the test for failure. Then, we found that other usage of kunit_add_action_or_reset() in resource_test_region_intersects() needs to be fixed too. We fix all these user-after-free bugs in this patch. Link: https://lkml.kernel.org/r/20240930070611.353338-1-ying.huang@intel.com Fixes: 99185c10d5d9 ("resource, kunit: add test case for region_intersects()") Signed-off-by: "Huang, Ying" Reported-by: Kees Bakker Closes: https://lore.kernel.org/lkml/87ldzaotcg.fsf@yhuang6-desk2.ccr.corp.intel.com/ Cc: Dan Williams Cc: David Hildenbrand Cc: Bjorn Helgaas Signed-off-by: Andrew Morton --- kernel/resource_kunit.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 42d2d8d20f5d..b8ef75b99eb2 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -169,6 +169,8 @@ static void resource_test_intersection(struct kunit *test) #define RES_TEST_RAM3_SIZE SZ_1M #define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) +KUNIT_DEFINE_ACTION_WRAPPER(kfree_wrapper, kfree, const void *); + static void remove_free_resource(void *ctx) { struct resource *res = (struct resource *)ctx; @@ -177,6 +179,14 @@ static void remove_free_resource(void *ctx) kfree(res); } +static void resource_test_add_action_or_abort( + struct kunit *test, void (*action)(void *), void *ctx) +{ + KUNIT_ASSERT_EQ_MSG(test, 0, + kunit_add_action_or_reset(test, action, ctx), + "Fail to add action"); +} + static void resource_test_request_region(struct kunit *test, struct resource *parent, resource_size_t start, resource_size_t size, const char *name, unsigned long flags) @@ -185,7 +195,7 @@ static void resource_test_request_region(struct kunit *test, struct resource *pa res = __request_region(parent, start, size, name, flags); KUNIT_ASSERT_NOT_NULL(test, res); - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_insert_resource(struct kunit *test, struct resource *parent, @@ -202,11 +212,11 @@ static void resource_test_insert_resource(struct kunit *test, struct resource *p res->end = start + size - 1; res->flags = flags; if (insert_resource(parent, res)) { - kfree(res); + resource_test_add_action_or_abort(test, kfree_wrapper, res); KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); } - kunit_add_action_or_reset(test, remove_free_resource, res); + resource_test_add_action_or_abort(test, remove_free_resource, res); } static void resource_test_region_intersects(struct kunit *test) @@ -220,7 +230,7 @@ static void resource_test_region_intersects(struct kunit *test) "test resources"); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); start = parent->start; - kunit_add_action_or_reset(test, remove_free_resource, parent); + resource_test_add_action_or_abort(test, remove_free_resource, parent); resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); From 47fa30118f02dc50e1c57242c6b72542c871b178 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Sep 2024 17:42:34 +0200 Subject: [PATCH 08/12] mm/huge_memory: check pmd_special() only after pmd_present() We should only check for pmd_special() after we made sure that we have a present PMD. For example, if we have a migration PMD, pmd_special() might indicate that we have a special PMD although we really don't. This fixes confusing migration entries as PFN mappings, and not doing what we are supposed to do in the "is_swap_pmd()" case further down in the function -- including messing up COW, page table handling and accounting. Link: https://lkml.kernel.org/r/20240926154234.2247217-1-david@redhat.com Fixes: bc02afbd4d73 ("mm/fork: accept huge pfnmap entries") Signed-off-by: David Hildenbrand Reported-by: syzbot+bf2c35fa302ebe3c7471@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/66f15c8d.050a0220.c23dd.000f.GAE@google.com/ Reviewed-by: Peter Xu Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ca89e0279a7..87b49ecc7b1e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1586,7 +1586,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_special(pmd))) { + if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); From 71e32fe63cb654fbf23933b10613714b2429118b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 27 Sep 2024 12:29:12 -0700 Subject: [PATCH 09/12] .mailmap: update Fangrui's email I'm leaving Google. Link: https://lkml.kernel.org/r/20240927192912.31532-1-i@maskray.me Signed-off-by: Fangrui Song Acked-by: Nathan Chancellor Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 0374777cc662..3faaa5f400c6 100644 --- a/.mailmap +++ b/.mailmap @@ -203,6 +203,7 @@ Ezequiel Garcia Faith Ekstrand Faith Ekstrand Faith Ekstrand +Fangrui Song Felipe W Damasio Felix Kuhling Felix Moeller From 532b53cebe58f34ce1c0f34d866f5c0e335c53c6 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Tue, 1 Oct 2024 09:00:41 +0100 Subject: [PATCH 10/12] secretmem: disable memfd_secret() if arch cannot set direct map Return -ENOSYS from memfd_secret() syscall if !can_set_direct_map(). This is the case for example on some arm64 configurations, where marking 4k PTEs in the direct map not present can only be done if the direct map is set up at 4k granularity in the first place (as ARM's break-before-make semantics do not easily allow breaking apart large/gigantic pages). More precisely, on arm64 systems with !can_set_direct_map(), set_direct_map_invalid_noflush() is a no-op, however it returns success (0) instead of an error. This means that memfd_secret will seemingly "work" (e.g. syscall succeeds, you can mmap the fd and fault in pages), but it does not actually achieve its goal of removing its memory from the direct map. Note that with this patch, memfd_secret() will start erroring on systems where can_set_direct_map() returns false (arm64 with CONFIG_RODATA_FULL_DEFAULT_ENABLED=n, CONFIG_DEBUG_PAGEALLOC=n and CONFIG_KFENCE=n), but that still seems better than the current silent failure. Since CONFIG_RODATA_FULL_DEFAULT_ENABLED defaults to 'y', most arm64 systems actually have a working memfd_secret() and aren't be affected. From going through the iterations of the original memfd_secret patch series, it seems that disabling the syscall in these scenarios was the intended behavior [1] (preferred over having set_direct_map_invalid_noflush return an error as that would result in SIGBUSes at page-fault time), however the check for it got dropped between v16 [2] and v17 [3], when secretmem moved away from CMA allocations. [1]: https://lore.kernel.org/lkml/20201124164930.GK8537@kernel.org/ [2]: https://lore.kernel.org/lkml/20210121122723.3446-11-rppt@kernel.org/#t [3]: https://lore.kernel.org/lkml/20201125092208.12544-10-rppt@kernel.org/ Link: https://lkml.kernel.org/r/20241001080056.784735-1-roypat@amazon.co.uk Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas") Signed-off-by: Patrick Roy Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: David Hildenbrand Cc: James Gowans Cc: Signed-off-by: Andrew Morton --- mm/secretmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 3afb5ad701e1..399552814fd0 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -238,7 +238,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); - if (!secretmem_enable) + if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) @@ -280,7 +280,7 @@ static struct file_system_type secretmem_fs = { static int __init secretmem_init(void) { - if (!secretmem_enable) + if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); From b1815690289449c2973b7ca77aea0e155677176f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Oct 2024 13:19:32 +0200 Subject: [PATCH 11/12] CREDITS: sort alphabetically by name Re-sort few misplaced entries in the CREDITS file. Link: https://lkml.kernel.org/r/20241002111932.46012-1-krzysztof.kozlowski@linaro.org Signed-off-by: Krzysztof Kozlowski Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- CREDITS | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/CREDITS b/CREDITS index d439f5a1bc00..63f53feefa0a 100644 --- a/CREDITS +++ b/CREDITS @@ -1358,10 +1358,6 @@ D: Major kbuild rework during the 2.5 cycle D: ISDN Maintainer S: USA -N: Gerrit Renker -E: gerrit@erg.abdn.ac.uk -D: DCCP protocol support. - N: Philip Gladstone E: philip@gladstonefamily.net D: Kernel / timekeeping stuff @@ -1677,11 +1673,6 @@ W: http://www.carumba.com/ D: bug toaster (A1 sauce makes all the difference) D: Random linux hacker -N: James Hogan -E: jhogan@kernel.org -D: Metag architecture maintainer -D: TZ1090 SoC maintainer - N: Tim Hockin E: thockin@hockin.org W: http://www.hockin.org/~thockin @@ -1697,6 +1688,11 @@ D: hwmon subsystem maintainer D: i2c-sis96x and i2c-stub SMBus drivers S: USA +N: James Hogan +E: jhogan@kernel.org +D: Metag architecture maintainer +D: TZ1090 SoC maintainer + N: Dirk Hohndel E: hohndel@suse.de D: The XFree86[tm] Project @@ -1872,6 +1868,10 @@ S: K osmidomkum 723 S: 160 00 Praha 6 S: Czech Republic +N: Seth Jennings +E: sjenning@redhat.com +D: Creation and maintenance of zswap + N: Jeremy Kerr D: Maintainer of SPU File System @@ -2188,19 +2188,6 @@ N: Mike Kravetz E: mike.kravetz@oracle.com D: Maintenance and development of the hugetlb subsystem -N: Seth Jennings -E: sjenning@redhat.com -D: Creation and maintenance of zswap - -N: Dan Streetman -E: ddstreet@ieee.org -D: Maintenance and development of zswap -D: Creation and maintenance of the zpool API - -N: Vitaly Wool -E: vitaly.wool@konsulko.com -D: Maintenance and development of zswap - N: Andreas S. Krebs E: akrebs@altavista.net D: CYPRESS CY82C693 chipset IDE, Digital's PC-Alpha 164SX boards @@ -3191,6 +3178,11 @@ N: Ken Pizzini E: ken@halcyon.com D: CDROM driver "sonycd535" (Sony CDU-535/531) +N: Mathieu Poirier +E: mathieu.poirier@linaro.org +D: CoreSight kernel subsystem, Maintainer 2014-2022 +D: Perf tool support for CoreSight + N: Stelian Pop E: stelian@popies.net P: 1024D/EDBB6147 7B36 0E07 04BC 11DC A7A0 D3F7 7185 9E7A EDBB 6147 @@ -3300,6 +3292,10 @@ S: Schlossbergring 9 S: 79098 Freiburg S: Germany +N: Gerrit Renker +E: gerrit@erg.abdn.ac.uk +D: DCCP protocol support. + N: Thomas Renninger E: trenn@suse.de D: cpupowerutils @@ -3576,11 +3572,6 @@ D: several improvements to system programs S: Oldenburg S: Germany -N: Mathieu Poirier -E: mathieu.poirier@linaro.org -D: CoreSight kernel subsystem, Maintainer 2014-2022 -D: Perf tool support for CoreSight - N: Robert Schwebel E: robert@schwebel.de W: https://www.schwebel.de @@ -3771,6 +3762,11 @@ S: Chr. Winthersvej 1 B, st.th. S: DK-1860 Frederiksberg C S: Denmark +N: Dan Streetman +E: ddstreet@ieee.org +D: Maintenance and development of zswap +D: Creation and maintenance of the zpool API + N: Drew Sullivan E: drew@ss.org W: http://www.ss.org/ @@ -4286,6 +4282,10 @@ S: Pipers Way S: Swindon. SN3 1RJ S: England +N: Vitaly Wool +E: vitaly.wool@konsulko.com +D: Maintenance and development of zswap + N: Chris Wright E: chrisw@sous-sol.org D: hacking on LSM framework and security modules. From aa5f0fa6af38d96bc6f1b7e1534f5b5c025930a6 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 12:42:13 -0700 Subject: [PATCH 12/12] mm: zswap: delete comments for "value" member of 'struct zswap_entry'. Made a minor edit in the comments for 'struct zswap_entry' to delete the description of the 'value' member that was deleted in commit 20a5532ffa53 ("mm: remove code to handle same filled pages"). Link: https://lkml.kernel.org/r/20241002194213.30041-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Fixes: 20a5532ffa53 ("mm: remove code to handle same filled pages") Reviewed-by: Nhat Pham Acked-by: Yosry Ahmed Reviewed-by: Usama Arif Cc: Chengming Zhou Cc: Huang Ying Cc: Johannes Weiner Cc: Kanchana P Sridhar Cc: Ryan Roberts Cc: Wajdi Feghali Signed-off-by: Andrew Morton --- mm/zswap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 449914ea9919..162013952074 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -190,7 +190,6 @@ static struct shrinker *zswap_shrinker; * section for context. * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data - * value - value of the same-value filled pages which have same content * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */