From fce09ea314505a52f2436397608fa0a5d0934fb1 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Sat, 2 Sep 2023 08:48:38 +0800 Subject: [PATCH 001/120] apparmor: Fix null pointer deref when receiving skb during sock creation The panic below is observed when receiving ICMP packets with secmark set while an ICMP raw socket is being created. SK_CTX(sk)->label is updated in apparmor_socket_post_create(), but the packet is delivered to the socket before that, causing the null pointer dereference. Drop the packet if label context is not set. BUG: kernel NULL pointer dereference, address: 000000000000004c #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 407 Comm: a.out Not tainted 6.4.12-arch1-1 #1 3e6fa2753a2d75925c34ecb78e22e85a65d083df Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/28/2020 RIP: 0010:aa_label_next_confined+0xb/0x40 Code: 00 00 48 89 ef e8 d5 25 0c 00 e9 66 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 89 f0 <8b> 77 4c 39 c6 7e 1f 48 63 d0 48 8d 14 d7 eb 0b 83 c0 01 48 83 c2 RSP: 0018:ffffa92940003b08 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000000e RDX: ffffa92940003be8 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8b57471e7800 R08: ffff8b574c642400 R09: 0000000000000002 R10: ffffffffbd820eeb R11: ffffffffbeb7ff00 R12: ffff8b574c642400 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 00007fb092ea7640(0000) GS:ffff8b577bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000004c CR3: 00000001020f2005 CR4: 00000000007706f0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x171/0x4e0 ? exc_page_fault+0x7f/0x180 ? asm_exc_page_fault+0x26/0x30 ? aa_label_next_confined+0xb/0x40 apparmor_secmark_check+0xec/0x330 security_sock_rcv_skb+0x35/0x50 sk_filter_trim_cap+0x47/0x250 sock_queue_rcv_skb_reason+0x20/0x60 raw_rcv+0x13c/0x210 raw_local_deliver+0x1f3/0x250 ip_protocol_deliver_rcu+0x4f/0x2f0 ip_local_deliver_finish+0x76/0xa0 __netif_receive_skb_one_core+0x89/0xa0 netif_receive_skb+0x119/0x170 ? __netdev_alloc_skb+0x3d/0x140 vmxnet3_rq_rx_complete+0xb23/0x1010 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a] vmxnet3_poll_rx_only+0x36/0xb0 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a] __napi_poll+0x28/0x1b0 net_rx_action+0x2a4/0x380 __do_softirq+0xd1/0x2c8 __irq_exit_rcu+0xbb/0xf0 common_interrupt+0x86/0xa0 asm_common_interrupt+0x26/0x40 RIP: 0010:apparmor_socket_post_create+0xb/0x200 Code: 08 48 85 ff 75 a1 eb b1 0f 1f 80 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 <55> 48 89 fd 53 45 85 c0 0f 84 b2 00 00 00 48 8b 1d 80 56 3f 02 48 RSP: 0018:ffffa92940ce7e50 EFLAGS: 00000286 RAX: ffffffffbc756440 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff8b574eaab740 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8b57444cec70 R11: 0000000000000000 R12: 0000000000000003 R13: 0000000000000002 R14: ffff8b574eaab740 R15: ffffffffbd8e4748 ? __pfx_apparmor_socket_post_create+0x10/0x10 security_socket_post_create+0x4b/0x80 __sock_create+0x176/0x1f0 __sys_socket+0x89/0x100 __x64_sys_socket+0x17/0x20 do_syscall_64+0x5d/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Fixes: ab9f2115081a ("apparmor: Allow filtering based on secmark policy") Signed-off-by: Xiao Liang Signed-off-by: John Johansen --- security/apparmor/lsm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index cef8c466af80..c2f73d281724 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1304,6 +1304,13 @@ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!skb->secmark) return 0; + /* + * If reach here before socket_post_create hook is called, in which + * case label is null, drop the packet. + */ + if (!ctx->label) + return -EACCES; + return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); } From 2bc73505a5cd2a18a7a542022722f136c19e3b87 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 1 Feb 2024 17:24:48 +0300 Subject: [PATCH 002/120] apparmor: use kvfree_sensitive to free data->data Inside unpack_profile() data->data is allocated using kvmemdup() so it should be freed with the corresponding kvfree_sensitive(). Also add missing data->data release for rhashtable insertion failure path in unpack_profile(). Found by Linux Verification Center (linuxtesting.org). Fixes: e025be0f26d5 ("apparmor: support querying extended trusted helper extra data") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Signed-off-by: John Johansen --- security/apparmor/policy.c | 2 +- security/apparmor/policy_unpack.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 957654d253dd..14df15e35695 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -225,7 +225,7 @@ static void aa_free_data(void *ptr, void *arg) { struct aa_data *data = ptr; - kfree_sensitive(data->data); + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); } diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 5e578ef0ddff..75452acd0e35 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -1071,6 +1071,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name) if (rhashtable_insert_fast(profile->data, &data->head, profile->data->p)) { + kvfree_sensitive(data->data, data->size); kfree_sensitive(data->key); kfree_sensitive(data); info = "failed to insert data to table"; From 4a8db3678403e34c31df6b99a26414d5e183538c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 4 Mar 2024 16:36:55 +0000 Subject: [PATCH 003/120] apparmor: remove useless static inline function is_deleted The inlined function is_deleted is redundant, it is not called at all from any function in security/apparmor/file.c and so it can be removed. Cleans up clang scan build warning: security/apparmor/file.c:153:20: warning: unused function 'is_deleted' [-Wunused-function] Signed-off-by: Colin Ian King Signed-off-by: John Johansen --- security/apparmor/file.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/security/apparmor/file.c b/security/apparmor/file.c index c03eb7c19f16..d52a5b14dad4 100644 --- a/security/apparmor/file.c +++ b/security/apparmor/file.c @@ -144,19 +144,6 @@ int aa_audit_file(const struct cred *subj_cred, return aa_audit(type, profile, &ad, file_audit_cb); } -/** - * is_deleted - test if a file has been completely unlinked - * @dentry: dentry of file to test for deletion (NOT NULL) - * - * Returns: true if deleted else false - */ -static inline bool is_deleted(struct dentry *dentry) -{ - if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) - return true; - return false; -} - static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, From b2c858148acf96290b9a9af259a04e080a169f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 15 Mar 2024 13:54:09 +0100 Subject: [PATCH 004/120] apparmor: fix typo in kernel doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the typo in the function documentation to please kernel doc warnings. Signed-off-by: Christian Göttsche Reviewed-by: Kees Cook Signed-off-by: John Johansen --- security/apparmor/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index c2f73d281724..717204fba938 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1124,7 +1124,7 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern) * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket - * @ptotocol: protocol of the socket + * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: From 3dd384108d53834002be5630132ad5c3f32166ad Mon Sep 17 00:00:00 2001 From: Leesoo Ahn Date: Wed, 8 May 2024 01:12:29 +0900 Subject: [PATCH 005/120] apparmor: fix possible NULL pointer dereference profile->parent->dents[AAFS_PROF_DIR] could be NULL only if its parent is made from __create_missing_ancestors(..) and 'ent->old' is NULL in aa_replace_profiles(..). In that case, it must return an error code and the code, -ENOENT represents its state that the path of its parent is not existed yet. BUG: kernel NULL pointer dereference, address: 0000000000000030 PGD 0 P4D 0 PREEMPT SMP PTI CPU: 4 PID: 3362 Comm: apparmor_parser Not tainted 6.8.0-24-generic #24 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:aafs_create.constprop.0+0x7f/0x130 Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82baac10 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0 Call Trace: ? show_regs+0x6d/0x80 ? __die+0x24/0x80 ? page_fault_oops+0x99/0x1b0 ? kernelmode_fixup_or_oops+0xb2/0x140 ? __bad_area_nosemaphore+0x1a5/0x2c0 ? find_vma+0x34/0x60 ? bad_area_nosemaphore+0x16/0x30 ? do_user_addr_fault+0x2a2/0x6b0 ? exc_page_fault+0x83/0x1b0 ? asm_exc_page_fault+0x27/0x30 ? aafs_create.constprop.0+0x7f/0x130 ? aafs_create.constprop.0+0x51/0x130 __aafs_profile_mkdir+0x3d6/0x480 aa_replace_profiles+0x83f/0x1270 policy_update+0xe3/0x180 profile_load+0xbc/0x150 ? rw_verify_area+0x47/0x140 vfs_write+0x100/0x480 ? __x64_sys_openat+0x55/0xa0 ? syscall_exit_to_user_mode+0x86/0x260 ksys_write+0x73/0x100 __x64_sys_write+0x19/0x30 x64_sys_call+0x7e/0x25c0 do_syscall_64+0x7f/0x180 entry_SYSCALL_64_after_hwframe+0x78/0x80 RIP: 0033:0x7be9f211c574 Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d d5 ea 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 RSP: 002b:00007ffd26f2b8c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00005d504415e200 RCX: 00007be9f211c574 RDX: 0000000000001fc1 RSI: 00005d504418bc80 RDI: 0000000000000004 RBP: 0000000000001fc1 R08: 0000000000001fc1 R09: 0000000080000000 R10: 0000000000000000 R11: 0000000000000202 R12: 00005d504418bc80 R13: 0000000000000004 R14: 00007ffd26f2b9b0 R15: 00007ffd26f2ba30 Modules linked in: snd_seq_dummy snd_hrtimer qrtr snd_hda_codec_generic snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device i2c_i801 snd_timer i2c_smbus qxl snd soundcore drm_ttm_helper lpc_ich ttm joydev input_leds serio_raw mac_hid binfmt_misc msr parport_pc ppdev lp parport efi_pstore nfnetlink dmi_sysfs qemu_fw_cfg ip_tables x_tables autofs4 hid_generic usbhid hid ahci libahci psmouse virtio_rng xhci_pci xhci_pci_renesas CR2: 0000000000000030 ---[ end trace 0000000000000000 ]--- RIP: 0010:aafs_create.constprop.0+0x7f/0x130 Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82baac10 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0 Signed-off-by: Leesoo Ahn Signed-off-by: John Johansen --- security/apparmor/apparmorfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index bcfea073e3f2..01b923d97a44 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -1692,6 +1692,10 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent) struct aa_profile *p; p = aa_deref_parent(profile); dent = prof_dir(p); + if (!dent) { + error = -ENOENT; + goto fail2; + } /* adding to parent that previously didn't have children */ dent = aafs_create_dir("profiles", dent); if (IS_ERR(dent)) From 7bd3d76a1f9fb38c8234cfadb7131e0b26deb919 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 8 Mar 2024 18:32:15 +0000 Subject: [PATCH 006/120] unicode: make utf8 test count static The variables failed_tests and total_tests are not used outside of the utf8-selftest.c file so make them static to avoid the following warnings: fs/unicode/utf8-selftest.c:17:14: warning: symbol 'failed_tests' was not declared. Should it be static? fs/unicode/utf8-selftest.c:18:14: warning: symbol 'total_tests' was not declared. Should it be static? Signed-off-by: Ben Dooks Link: https://lore.kernel.org/r/20240308183215.1924331-1-ben.dooks@codethink.co.uk Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/utf8-selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index eb2bbdd688d7..c928e6007356 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -14,8 +14,8 @@ #include "utf8n.h" -unsigned int failed_tests; -unsigned int total_tests; +static unsigned int failed_tests; +static unsigned int total_tests; /* Tests will be based on this version. */ #define UTF8_LATEST UNICODE_AGE(12, 1, 0) From 68318904a7758e11f16fa9d202a6df60f896e71a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 24 May 2024 11:48:09 -0700 Subject: [PATCH 007/120] unicode: add MODULE_DESCRIPTION() macros Currently 'make W=1' reports: WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8data.o WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8-selftest.o Add a MODULE_DESCRIPTION() to utf8-selftest.c and utf8data.c_shipped, and update mkutf8data.c to add a MODULE_DESCRIPTION() to any future generated utf8data file. Signed-off-by: Jeff Johnson Link: https://lore.kernel.org/r/20240524-md-unicode-v1-1-e2727ce8574d@quicinc.com Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/mkutf8data.c | 1 + fs/unicode/utf8-selftest.c | 1 + fs/unicode/utf8data.c_shipped | 1 + 3 files changed, 3 insertions(+) diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index bc1a7c8b5c8d..77b685db8275 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -3352,6 +3352,7 @@ static void write_file(void) fprintf(file, "};\n"); fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); fprintf(file, "\n"); + fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n"); fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); fclose(file); } diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c index c928e6007356..600e15efe9ed 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/utf8-selftest.c @@ -307,4 +307,5 @@ module_init(init_test_ucd); module_exit(exit_test_ucd); MODULE_AUTHOR("Gabriel Krisman Bertazi "); +MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped index d9b62901aa96..dafa5fed761d 100644 --- a/fs/unicode/utf8data.c_shipped +++ b/fs/unicode/utf8data.c_shipped @@ -4120,4 +4120,5 @@ struct utf8data_table utf8_data_table = { .utf8data = utf8data, }; EXPORT_SYMBOL_GPL(utf8_data_table); +MODULE_DESCRIPTION("UTF8 data table"); MODULE_LICENSE("GPL v2"); From 1b3bf0747d4f1a963e59c26e602868bdce195318 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 11 Jun 2024 10:19:50 -0700 Subject: [PATCH 008/120] tools/power/turbostat: Switch to new Intel CPU model defines New CPU #defines encode vendor and family as well as model. N.B. Copied VFM_*() defines here from to avoid an application picking a second internal kernel header file. Signed-off-by: Tony Luck Acked-by: Rafael J. Wysocki Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 165 +++++++++++++++----------- 1 file changed, 95 insertions(+), 70 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9f5d053d4bc6..02312e62c857 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -9,6 +9,30 @@ #define _GNU_SOURCE #include MSRHEADER + +// copied from arch/x86/include/asm/cpu_device_id.h +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) +// end copied section + +#define X86_VENDOR_INTEL 0 + #include INTEL_FAMILY_HEADER #include BUILD_BUG_HEADER #include @@ -367,7 +391,7 @@ struct platform_features { }; struct platform_data { - unsigned int model; + unsigned int vfm; const struct platform_features *features; }; @@ -910,75 +934,75 @@ static const struct platform_features amd_features_with_rapl = { }; static const struct platform_data turbostat_pdata[] = { - { INTEL_FAM6_NEHALEM, &nhm_features }, - { INTEL_FAM6_NEHALEM_G, &nhm_features }, - { INTEL_FAM6_NEHALEM_EP, &nhm_features }, - { INTEL_FAM6_NEHALEM_EX, &nhx_features }, - { INTEL_FAM6_WESTMERE, &nhm_features }, - { INTEL_FAM6_WESTMERE_EP, &nhm_features }, - { INTEL_FAM6_WESTMERE_EX, &nhx_features }, - { INTEL_FAM6_SANDYBRIDGE, &snb_features }, - { INTEL_FAM6_SANDYBRIDGE_X, &snx_features }, - { INTEL_FAM6_IVYBRIDGE, &ivb_features }, - { INTEL_FAM6_IVYBRIDGE_X, &ivx_features }, - { INTEL_FAM6_HASWELL, &hsw_features }, - { INTEL_FAM6_HASWELL_X, &hsx_features }, - { INTEL_FAM6_HASWELL_L, &hswl_features }, - { INTEL_FAM6_HASWELL_G, &hswg_features }, - { INTEL_FAM6_BROADWELL, &bdw_features }, - { INTEL_FAM6_BROADWELL_G, &bdwg_features }, - { INTEL_FAM6_BROADWELL_X, &bdx_features }, - { INTEL_FAM6_BROADWELL_D, &bdx_features }, - { INTEL_FAM6_SKYLAKE_L, &skl_features }, - { INTEL_FAM6_SKYLAKE, &skl_features }, - { INTEL_FAM6_SKYLAKE_X, &skx_features }, - { INTEL_FAM6_KABYLAKE_L, &skl_features }, - { INTEL_FAM6_KABYLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE, &skl_features }, - { INTEL_FAM6_COMETLAKE_L, &skl_features }, - { INTEL_FAM6_CANNONLAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_X, &icx_features }, - { INTEL_FAM6_ICELAKE_D, &icx_features }, - { INTEL_FAM6_ICELAKE_L, &cnl_features }, - { INTEL_FAM6_ICELAKE_NNPI, &cnl_features }, - { INTEL_FAM6_ROCKETLAKE, &cnl_features }, - { INTEL_FAM6_TIGERLAKE_L, &cnl_features }, - { INTEL_FAM6_TIGERLAKE, &cnl_features }, - { INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features }, - { INTEL_FAM6_EMERALDRAPIDS_X, &spr_features }, - { INTEL_FAM6_GRANITERAPIDS_X, &spr_features }, - { INTEL_FAM6_LAKEFIELD, &cnl_features }, - { INTEL_FAM6_ALDERLAKE, &adl_features }, - { INTEL_FAM6_ALDERLAKE_L, &adl_features }, - { INTEL_FAM6_RAPTORLAKE, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_P, &adl_features }, - { INTEL_FAM6_RAPTORLAKE_S, &adl_features }, - { INTEL_FAM6_METEORLAKE, &cnl_features }, - { INTEL_FAM6_METEORLAKE_L, &cnl_features }, - { INTEL_FAM6_ARROWLAKE_H, &arl_features }, - { INTEL_FAM6_ARROWLAKE_U, &arl_features }, - { INTEL_FAM6_ARROWLAKE, &arl_features }, - { INTEL_FAM6_LUNARLAKE_M, &arl_features }, - { INTEL_FAM6_ATOM_SILVERMONT, &slv_features }, - { INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features }, - { INTEL_FAM6_ATOM_AIRMONT, &amt_features }, - { INTEL_FAM6_ATOM_GOLDMONT, &gmt_features }, - { INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features }, - { INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features }, - { INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features }, - { INTEL_FAM6_ATOM_TREMONT, &tmt_features }, - { INTEL_FAM6_ATOM_TREMONT_L, &tmt_features }, - { INTEL_FAM6_ATOM_GRACEMONT, &adl_features }, - { INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features }, - { INTEL_FAM6_ATOM_CRESTMONT, &grr_features }, - { INTEL_FAM6_XEON_PHI_KNL, &knl_features }, - { INTEL_FAM6_XEON_PHI_KNM, &knl_features }, + { INTEL_NEHALEM, &nhm_features }, + { INTEL_NEHALEM_G, &nhm_features }, + { INTEL_NEHALEM_EP, &nhm_features }, + { INTEL_NEHALEM_EX, &nhx_features }, + { INTEL_WESTMERE, &nhm_features }, + { INTEL_WESTMERE_EP, &nhm_features }, + { INTEL_WESTMERE_EX, &nhx_features }, + { INTEL_SANDYBRIDGE, &snb_features }, + { INTEL_SANDYBRIDGE_X, &snx_features }, + { INTEL_IVYBRIDGE, &ivb_features }, + { INTEL_IVYBRIDGE_X, &ivx_features }, + { INTEL_HASWELL, &hsw_features }, + { INTEL_HASWELL_X, &hsx_features }, + { INTEL_HASWELL_L, &hswl_features }, + { INTEL_HASWELL_G, &hswg_features }, + { INTEL_BROADWELL, &bdw_features }, + { INTEL_BROADWELL_G, &bdwg_features }, + { INTEL_BROADWELL_X, &bdx_features }, + { INTEL_BROADWELL_D, &bdx_features }, + { INTEL_SKYLAKE_L, &skl_features }, + { INTEL_SKYLAKE, &skl_features }, + { INTEL_SKYLAKE_X, &skx_features }, + { INTEL_KABYLAKE_L, &skl_features }, + { INTEL_KABYLAKE, &skl_features }, + { INTEL_COMETLAKE, &skl_features }, + { INTEL_COMETLAKE_L, &skl_features }, + { INTEL_CANNONLAKE_L, &cnl_features }, + { INTEL_ICELAKE_X, &icx_features }, + { INTEL_ICELAKE_D, &icx_features }, + { INTEL_ICELAKE_L, &cnl_features }, + { INTEL_ICELAKE_NNPI, &cnl_features }, + { INTEL_ROCKETLAKE, &cnl_features }, + { INTEL_TIGERLAKE_L, &cnl_features }, + { INTEL_TIGERLAKE, &cnl_features }, + { INTEL_SAPPHIRERAPIDS_X, &spr_features }, + { INTEL_EMERALDRAPIDS_X, &spr_features }, + { INTEL_GRANITERAPIDS_X, &spr_features }, + { INTEL_LAKEFIELD, &cnl_features }, + { INTEL_ALDERLAKE, &adl_features }, + { INTEL_ALDERLAKE_L, &adl_features }, + { INTEL_RAPTORLAKE, &adl_features }, + { INTEL_RAPTORLAKE_P, &adl_features }, + { INTEL_RAPTORLAKE_S, &adl_features }, + { INTEL_METEORLAKE, &cnl_features }, + { INTEL_METEORLAKE_L, &cnl_features }, + { INTEL_ARROWLAKE_H, &arl_features }, + { INTEL_ARROWLAKE_U, &arl_features }, + { INTEL_ARROWLAKE, &arl_features }, + { INTEL_LUNARLAKE_M, &arl_features }, + { INTEL_ATOM_SILVERMONT, &slv_features }, + { INTEL_ATOM_SILVERMONT_D, &slvd_features }, + { INTEL_ATOM_AIRMONT, &amt_features }, + { INTEL_ATOM_GOLDMONT, &gmt_features }, + { INTEL_ATOM_GOLDMONT_D, &gmtd_features }, + { INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features }, + { INTEL_ATOM_TREMONT_D, &tmtd_features }, + { INTEL_ATOM_TREMONT, &tmt_features }, + { INTEL_ATOM_TREMONT_L, &tmt_features }, + { INTEL_ATOM_GRACEMONT, &adl_features }, + { INTEL_ATOM_CRESTMONT_X, &srf_features }, + { INTEL_ATOM_CRESTMONT, &grr_features }, + { INTEL_XEON_PHI_KNL, &knl_features }, + { INTEL_XEON_PHI_KNM, &knl_features }, /* * Missing support for - * INTEL_FAM6_ICELAKE - * INTEL_FAM6_ATOM_SILVERMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_MID - * INTEL_FAM6_ATOM_AIRMONT_NP + * INTEL_ICELAKE + * INTEL_ATOM_SILVERMONT_MID + * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, }; @@ -1003,11 +1027,12 @@ void probe_platform_features(unsigned int family, unsigned int model) return; } - if (!genuine_intel || family != 6) + if (!genuine_intel) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (turbostat_pdata[i].model == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && + VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } From c81c8ee445e84703cd90171365ed596ef94399cc Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 12:06:39 +0200 Subject: [PATCH 009/120] tools/power turbostat: Remove anonymous union from rapl_counter_info_t fd_perf field used to be part of the union, but later moved out of it, because we test it with fd_perf != -1 to determine if any perf counter is opened, making the union unused. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 02312e62c857..1ac74cd49cc9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1085,15 +1085,9 @@ struct rapl_counter_info_t { unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; - - union { - /* Active when source == RAPL_SOURCE_MSR */ - struct { - unsigned long long msr[NUM_RAPL_COUNTERS]; - unsigned long long msr_mask[NUM_RAPL_COUNTERS]; - int msr_shift[NUM_RAPL_COUNTERS]; - }; - }; + unsigned long long msr[NUM_RAPL_COUNTERS]; + unsigned long long msr_mask[NUM_RAPL_COUNTERS]; + int msr_shift[NUM_RAPL_COUNTERS]; int fd_perf; }; From 73ed3c941a76ff3348022477975c84b8d183599a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 12:22:57 +0200 Subject: [PATCH 010/120] tools/power turbostat: Replace enum rapl_source and cstate_source with counter_source Reuse the enum. It means the same thing in both cases. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 +++++++++++++-------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1ac74cd49cc9..0fc6c107e371 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -86,8 +86,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; -enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR }; -enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR }; +enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; struct sysfs_path { char path[PATH_BYTES]; @@ -1081,7 +1080,7 @@ enum rapl_unit { struct rapl_counter_info_t { unsigned long long data[NUM_RAPL_COUNTERS]; - enum rapl_source source[NUM_RAPL_COUNTERS]; + enum counter_source source[NUM_RAPL_COUNTERS]; unsigned long long flags[NUM_RAPL_COUNTERS]; double scale[NUM_RAPL_COUNTERS]; enum rapl_unit unit[NUM_RAPL_COUNTERS]; @@ -1243,7 +1242,7 @@ enum ccstate_rci_index { struct cstate_counter_info_t { unsigned long long data[NUM_CSTATE_COUNTERS]; - enum cstate_source source[NUM_CSTATE_COUNTERS]; + enum counter_source source[NUM_CSTATE_COUNTERS]; unsigned long long msr[NUM_CSTATE_COUNTERS]; int fd_perf_core; int fd_perf_pkg; @@ -3601,7 +3600,7 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) size_t ret = 0; for (int i = 0; i < NUM_RAPL_COUNTERS; ++i) - if (rci->source[i] == RAPL_SOURCE_PERF) + if (rci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3612,7 +3611,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t size_t ret = 0; for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) - if (cci->source[i] == CSTATE_SOURCE_PERF) + if (cci->source[i] == COUNTER_SOURCE_PERF) ++ret; return ret; @@ -3653,10 +3652,10 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) { switch (rci->source[i]) { - case RAPL_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case RAPL_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); @@ -3669,7 +3668,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct ++pi; break; - case RAPL_SOURCE_MSR: + case COUNTER_SOURCE_MSR: if (debug) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); @@ -3791,10 +3790,10 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) { switch (cci->source[i]) { - case CSTATE_SOURCE_NONE: + case COUNTER_SOURCE_NONE: break; - case CSTATE_SOURCE_PERF: + case COUNTER_SOURCE_PERF: assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); @@ -3807,7 +3806,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat ++pi; break; - case CSTATE_SOURCE_MSR: + case COUNTER_SOURCE_MSR: assert(!no_msr); if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; @@ -3828,7 +3827,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat * when invoked for the thread sibling. */ #define PERF_COUNTER_WRITE_DATA(out_counter, index) do { \ - if (cci->source[index] != CSTATE_SOURCE_NONE) \ + if (cci->source[index] != COUNTER_SOURCE_NONE) \ out_counter = cci->data[index]; \ } while (0) @@ -6894,7 +6893,7 @@ void rapl_perf_init(void) rci->fd_perf = -1; for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) { rci->data[i] = 0; - rci->source[i] = RAPL_SOURCE_NONE; + rci->source[i] = COUNTER_SOURCE_NONE; } } @@ -6936,14 +6935,14 @@ void rapl_perf_init(void) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) { - rci->source[cai->rci_index] = RAPL_SOURCE_PERF; + rci->source[cai->rci_index] = COUNTER_SOURCE_PERF; rci->scale[cai->rci_index] = scale * cai->compat_scale; rci->unit[cai->rci_index] = unit; rci->flags[cai->rci_index] = cai->flags; /* Use MSR for this counter */ } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { - rci->source[cai->rci_index] = RAPL_SOURCE_MSR; + rci->source[cai->rci_index] = COUNTER_SOURCE_MSR; rci->msr[cai->rci_index] = cai->msr; rci->msr_mask[cai->rci_index] = cai->msr_mask; rci->msr_shift[cai->rci_index] = cai->msr_shift; @@ -6953,7 +6952,7 @@ void rapl_perf_init(void) } } - if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE) + if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE) has_counter = 1; } @@ -7146,17 +7145,17 @@ void cstate_perf_init_(bool soft_c1) /* Use perf API for this counter */ if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) { - cci->source[cai->rci_index] = CSTATE_SOURCE_PERF; + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; /* User MSR for this counter */ } else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit && probe_msr(cpu, cai->msr) == 0) { - cci->source[cai->rci_index] = CSTATE_SOURCE_MSR; + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; cci->msr[cai->rci_index] = cai->msr; } } - if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) { + if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) { has_counter = true; cores_visited[core_id] = true; pkg_visited[pkg_id] = true; From 25e713c6b54a899b5a8b5e09b58f4b87a5b3664b Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 13:04:33 +0200 Subject: [PATCH 011/120] tools/power turbostat: Add ZERO_ARRAY for zero initializing builtin array It makes it harder to shoot yourself in the foot, by using additional __must_be_array() check. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0fc6c107e371..29751b41ea0d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1060,6 +1060,8 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 +#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) + /* Indexes used to map data read from perf and MSRs into global variables */ enum rapl_rci_index { RAPL_RCI_INDEX_ENERGY_PKG = 0, @@ -3733,9 +3735,9 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(ccstate_counter_info); assert(cpu <= ccstate_counter_info_size); - memset(perf_data, 0, sizeof(perf_data)); - memset(perf_data_core, 0, sizeof(perf_data_core)); - memset(perf_data_pkg, 0, sizeof(perf_data_pkg)); + ZERO_ARRAY(perf_data); + ZERO_ARRAY(perf_data_core); + ZERO_ARRAY(perf_data_pkg); cci = &ccstate_counter_info[cpu]; From 675e979db473d08be346a3190c5f0db095a57153 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 7 Jun 2024 16:43:58 +0200 Subject: [PATCH 012/120] cxl/events: Use a common struct for DRAM and General Media events cxl_event_common was an unfortunate naming choice and caused confusion with the existing Common Event Record. Furthermore, its fields didn't map all the common information between DRAM and General Media Events. Remove cxl_event_common and introduce cxl_event_media_hdr to record common information between DRAM and General Media events. cxl_event_media_hdr, which is embedded in both cxl_event_gen_media and cxl_event_dram, leverages the commonalities between the two events to simplify their respective handling. Suggested-by: Dan Williams Reviewed-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20240607144423.48681-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/mbox.c | 2 +- drivers/cxl/core/trace.h | 32 ++++++++++----------- include/linux/cxl-event.h | 45 +++++++++++++----------------- tools/testing/cxl/test/mem.c | 54 +++++++++++++++++++----------------- 4 files changed, 65 insertions(+), 68 deletions(-) diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2626f3fff201..a08f050cc1ca 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -875,7 +875,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, guard(rwsem_read)(&cxl_region_rwsem); guard(rwsem_read)(&cxl_dpa_rwsem); - dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK; + dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); if (cxlr) hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index ee5cd4eb2f16..6d8b71d8f6c4 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -340,23 +340,23 @@ TRACE_EVENT(cxl_general_media, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID; /* General Media */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; /* Mask after flags have been parsed */ __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->device = get_unaligned_le24(rec->device); memcpy(__entry->comp_id, &rec->component_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE); - __entry->validity_flags = get_unaligned_le16(&rec->validity_flags); + __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); __entry->hpa = hpa; if (cxlr) { __assign_str(region_name); @@ -440,19 +440,19 @@ TRACE_EVENT(cxl_dram, ), TP_fast_assign( - CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); __entry->hdr_uuid = CXL_EVENT_DRAM_UUID; /* DRAM */ - __entry->dpa = le64_to_cpu(rec->phys_addr); + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; __entry->dpa &= CXL_DPA_MASK; - __entry->descriptor = rec->descriptor; - __entry->type = rec->type; - __entry->transaction_type = rec->transaction_type; - __entry->validity_flags = get_unaligned_le16(rec->validity_flags); - __entry->channel = rec->channel; - __entry->rank = rec->rank; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags); + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask); __entry->bank_group = rec->bank_group; __entry->bank = rec->bank; diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 60b25020281f..0bea1afbd747 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -21,6 +21,21 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +struct cxl_event_media_hdr { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + /* + * The meaning of Validity Flags from bit 2 is + * different across DRAM and General Media records + */ + u8 validity_flags[2]; + u8 channel; + u8 rank; +} __packed; + #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_generic { struct cxl_event_record_hdr hdr; @@ -33,14 +48,7 @@ struct cxl_event_generic { */ #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct cxl_event_gen_media { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 device[3]; u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; u8 reserved[46]; @@ -52,14 +60,7 @@ struct cxl_event_gen_media { */ #define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct cxl_event_dram { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - u8 validity_flags[2]; - u8 channel; - u8 rank; + struct cxl_event_media_hdr media_hdr; u8 nibble_mask[3]; u8 bank_group; u8 bank; @@ -95,21 +96,13 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; -/* - * General Media or DRAM Event Common Fields - * - provides common access to phys_addr - */ -struct cxl_event_common { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; -} __packed; - union cxl_event { struct cxl_event_generic generic; struct cxl_event_gen_media gen_media; struct cxl_event_dram dram; struct cxl_event_mem_module mem_module; - struct cxl_event_common common; + /* dram & gen_media event header */ + struct cxl_event_media_hdr media_hdr; } __packed; /* diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index eaf091a3d331..94de2cf60f4f 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -385,19 +385,21 @@ struct cxl_test_gen_media { struct cxl_test_gen_media gen_media = { .id = CXL_EVENT_GEN_MEDIA_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_gen_media), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_gen_media), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x2000), + .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, + .transaction_type = CXL_GMER_TRANS_HOST_WRITE, + /* .validity_flags = */ + .channel = 1, + .rank = 30, }, - .phys_addr = cpu_to_le64(0x2000), - .descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, - .transaction_type = CXL_GMER_TRANS_HOST_WRITE, - /* .validity_flags = */ - .channel = 1, - .rank = 30 }, }; @@ -409,18 +411,20 @@ struct cxl_test_dram { struct cxl_test_dram dram = { .id = CXL_EVENT_DRAM_UUID, .rec = { - .hdr = { - .length = sizeof(struct cxl_test_dram), - .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, - /* .handle = Set dynamically */ - .related_handle = cpu_to_le16(0), + .media_hdr = { + .hdr = { + .length = sizeof(struct cxl_test_dram), + .flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, + /* .handle = Set dynamically */ + .related_handle = cpu_to_le16(0), + }, + .phys_addr = cpu_to_le64(0x8000), + .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, + .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, + .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, + /* .validity_flags = */ + .channel = 1, }, - .phys_addr = cpu_to_le64(0x8000), - .descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT, - .type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR, - .transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, - /* .validity_flags = */ - .channel = 1, .bank_group = 5, .bank = 2, .column = {0xDE, 0xAD}, @@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds, static void cxl_mock_add_event_logs(struct mock_event_store *mes) { put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK, - &gen_media.rec.validity_flags); + &gen_media.rec.media_hdr.validity_flags); put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP | CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN, - &dram.rec.validity_flags); + &dram.rec.media_hdr.validity_flags); mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed); mes_add_event(mes, CXL_EVENT_TYPE_INFO, From a0caa19711ceb54c34368f66a746844fb03fde6c Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Fri, 7 Jun 2024 06:57:15 -0700 Subject: [PATCH 013/120] cxl: add missing MODULE_DESCRIPTION() macros make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/core/cxl_core.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pci.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_mem.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_acpi.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pmem.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_port.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240607-md-drivers-cxl-v2-1-0c61d95ee7a7@quicinc.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 1 + drivers/cxl/core/port.c | 1 + drivers/cxl/mem.c | 1 + drivers/cxl/pci.c | 1 + drivers/cxl/pmem.c | 1 + drivers/cxl/port.c | 1 + 6 files changed, 6 insertions(+) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 571069863c62..e51315ea4a6a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -921,6 +921,7 @@ static void __exit cxl_acpi_exit(void) /* load before dax_hmem sees 'Soft Reserved' CXL ranges */ subsys_initcall(cxl_acpi_init); module_exit(cxl_acpi_exit); +MODULE_DESCRIPTION("CXL ACPI: Platform Support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_IMPORT_NS(ACPI); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 887ed6e358fb..e31c5fcd9bf8 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -2356,5 +2356,6 @@ static void cxl_core_exit(void) subsys_initcall(cxl_core_init); module_exit(cxl_core_exit); +MODULE_DESCRIPTION("CXL: Core Compute Express Link support"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 0c79d9ce877c..1afb0e78082b 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -252,6 +252,7 @@ static struct cxl_driver cxl_mem_driver = { module_cxl_driver(cxl_mem_driver); +MODULE_DESCRIPTION("CXL: Memory Expansion"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index e53646e9f2fb..4be35dc22202 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1066,5 +1066,6 @@ static void __exit cxl_pci_driver_exit(void) module_init(cxl_pci_driver_init); module_exit(cxl_pci_driver_exit); +MODULE_DESCRIPTION("CXL: PCI manageability"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 2ecdaee63021..4ef93da22335 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -453,6 +453,7 @@ static __exit void cxl_pmem_exit(void) cxl_driver_unregister(&cxl_nvdimm_bridge_driver); } +MODULE_DESCRIPTION("CXL PMEM: Persistent Memory Support"); MODULE_LICENSE("GPL v2"); module_init(cxl_pmem_init); module_exit(cxl_pmem_exit); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 97c21566677a..d7d5d982ce69 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -209,6 +209,7 @@ static struct cxl_driver cxl_port_driver = { }; module_cxl_driver(cxl_port_driver); +MODULE_DESCRIPTION("CXL: Port enumeration and services"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_PORT); From a3483ee7e6a7f2d12b5950246f4e0ef94f4a5df0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:37 +0800 Subject: [PATCH 014/120] cxl/region: Fix a race condition in memory hotplug notifier In the memory hotplug notifier function of the CXL region, cxl_region_perf_attrs_callback(), the node ID is obtained by checking the host address range of the region. However, the address range information is not available when the region is registered in devm_cxl_add_region(). Additionally, this information may be removed or added under the protection of cxl_region_rwsem during runtime. If the memory notifier is called for nodes other than that backed by the region, a race condition may occur, potentially leading to a NULL dereference or an invalid address range. The race condition is addressed by checking the availability of the address range information under the protection of cxl_region_rwsem. To enhance code readability and use guard(), the relevant code has been moved into a newly added function: cxl_region_nid(). Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Signed-off-by: Huang, Ying Cc: Dan Williams Cc: Alison Schofield Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-2-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3c2b6144be23..51aeef2c012c 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2304,14 +2304,25 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) return true; } +static int cxl_region_nid(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_decoder *cxld; + + guard(rwsem_read)(&cxl_region_rwsem); + cxled = p->targets[0]; + if (!cxled) + return NUMA_NO_NODE; + cxld = &cxled->cxld; + return phys_to_target_node(cxld->hpa_range.start); +} + static int cxl_region_perf_attrs_callback(struct notifier_block *nb, unsigned long action, void *arg) { struct cxl_region *cxlr = container_of(nb, struct cxl_region, memory_notifier); - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = p->targets[0]; - struct cxl_decoder *cxld = &cxled->cxld; struct memory_notify *mnb = arg; int nid = mnb->status_change_nid; int region_nid; @@ -2319,7 +2330,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid == NUMA_NO_NODE || action != MEM_ONLINE) return NOTIFY_DONE; - region_nid = phys_to_target_node(cxld->hpa_range.start); + region_nid = cxl_region_nid(cxlr); if (nid != region_nid) return NOTIFY_DONE; From 643e8e3e65290fdfe507bb23fa524c77e1345af3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:38 +0800 Subject: [PATCH 015/120] cxl/region: Support to calculate memory tier abstract distance An abstract distance value must be assigned by the driver that makes the memory available to the system. It reflects relative performance and is used to place memory nodes backed by CXL regions in the appropriate memory tiers allowing promotion/demotion within the existing memory tiering mechanism. The abstract distance is calculated based on the memory access latency and bandwidth of CXL regions. Signed-off-by: Huang, Ying Acked-by: Dan Williams Cc: Alison Schofield Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-3-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 27 +++++++++++++++++++++++++++ drivers/cxl/cxl.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 51aeef2c012c..dc15ceba7ab7 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -2228,6 +2229,7 @@ static void unregister_region(void *_cxlr) int i; unregister_memory_notifier(&cxlr->memory_notifier); + unregister_mt_adistance_algorithm(&cxlr->adist_notifier); device_del(&cxlr->dev); /* @@ -2340,6 +2342,27 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, return NOTIFY_OK; } +static int cxl_region_calculate_adistance(struct notifier_block *nb, + unsigned long nid, void *data) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + adist_notifier); + struct access_coordinate *perf; + int *adist = data; + int region_nid; + + region_nid = cxl_region_nid(cxlr); + if (nid != region_nid) + return NOTIFY_OK; + + perf = &cxlr->coord[ACCESS_COORDINATE_CPU]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + /** * devm_cxl_add_region - Adds a region to a decoder * @cxlrd: root decoder @@ -2382,6 +2405,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, cxlr->memory_notifier.priority = CXL_CALLBACK_PRI; register_memory_notifier(&cxlr->memory_notifier); + cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance; + cxlr->adist_notifier.priority = 100; + register_mt_adistance_algorithm(&cxlr->adist_notifier); + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); if (rc) return ERR_PTR(rc); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 603c0120cff8..f46252373159 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -522,6 +522,7 @@ struct cxl_region_params { * @params: active + config params for the region * @coord: QoS access coordinates for the region * @memory_notifier: notifier for setting the access coordinates to node + * @adist_notifier: notifier for calculating the abstract distance of node */ struct cxl_region { struct device dev; @@ -534,6 +535,7 @@ struct cxl_region { struct cxl_region_params params; struct access_coordinate coord[ACCESS_COORDINATE_MAX]; struct notifier_block memory_notifier; + struct notifier_block adist_notifier; }; struct cxl_nvdimm_bridge { From f3d70720e92c24c22e0e63c2588636edba33eb1a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 18 Jun 2024 16:46:39 +0800 Subject: [PATCH 016/120] cxl/region: Simplify cxl_region_nid() The node ID of the region can be gotten via resource start address directly. This simplifies the implementation of cxl_region_nid(). Signed-off-by: Huang Ying Suggested-by: Alison Schofield Cc: Dan Williams Cc: Andrew Morton Cc: Jonathan Cameron Cc: Dave Jiang Cc: Bharata B Rao Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Davidlohr Bueso Cc: Vishal Verma Cc: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20240618084639.1419629-4-ying.huang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index dc15ceba7ab7..32473fddce03 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2309,15 +2309,13 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) static int cxl_region_nid(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled; - struct cxl_decoder *cxld; + struct resource *res; guard(rwsem_read)(&cxl_region_rwsem); - cxled = p->targets[0]; - if (!cxled) + res = p->res; + if (!res) return NUMA_NO_NODE; - cxld = &cxled->cxld; - return phys_to_target_node(cxld->hpa_range.start); + return phys_to_target_node(res->start); } static int cxl_region_perf_attrs_callback(struct notifier_block *nb, From 67f9c312b0a7f4bc869376d2a68308e673235954 Mon Sep 17 00:00:00 2001 From: Aswin Unnikrishnan Date: Sun, 12 May 2024 11:23:20 +0000 Subject: [PATCH 017/120] rust: add example for `alias` argument in `module` macro documentation Add example for `alias` argument supported by `module` macro. `alias` accepts an array of alternate names for the module as string. Reviewed-by: Alice Ryhl Signed-off-by: Aswin Unnikrishnan Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240512112324.8514-1-aswinunni01@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 520eae5fd792..aa89b41fa10e 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -35,6 +35,7 @@ use proc_macro::TokenStream; /// author: "Rust for Linux Contributors", /// description: "My very own kernel module!", /// license: "GPL", +/// alias: ["alternate_module_name"], /// } /// /// struct MyModule; From 63249a070eb5187d5caec995d171b53e374a0741 Mon Sep 17 00:00:00 2001 From: Aswin Unnikrishnan Date: Sun, 12 May 2024 11:23:21 +0000 Subject: [PATCH 018/120] rust: fix datatype in docs for `module` macro arguments Remove the mention of byte array as datatype for `module` macro arguments since the arguments are defined as string, and `alias` is a string array. Signed-off-by: Aswin Unnikrishnan Reviewed-by: Vincenzo Palazzo Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240512112324.8514-2-aswinunni01@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index aa89b41fa10e..5214e07367c5 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -58,11 +58,11 @@ use proc_macro::TokenStream; /// /// # Supported argument types /// - `type`: type which implements the [`Module`] trait (required). -/// - `name`: byte array of the name of the kernel module (required). -/// - `author`: byte array of the author of the kernel module. -/// - `description`: byte array of the description of the kernel module. -/// - `license`: byte array of the license of the kernel module (required). -/// - `alias`: byte array of alias name of the kernel module. +/// - `name`: ASCII string literal of the name of the kernel module (required). +/// - `author`: string literal of the author of the kernel module. +/// - `description`: string literal of the description of the kernel module. +/// - `license`: ASCII string literal of the license of the kernel module (required). +/// - `alias`: array of ASCII string literals of the alias names of the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) From 549d3c2ffbea44fe123a67983fd8b15ab6989d8d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 1 May 2024 21:35:48 +0900 Subject: [PATCH 019/120] rust: add 'firmware' field support to module! macro This adds 'firmware' field support to module! macro, corresponds to MODULE_FIRMWARE macro. You can specify the file names of binary firmware that the kernel module requires. The information is embedded in the modinfo section of the kernel module. For example, a tool to build an initramfs uses this information to put the firmware files into the initramfs image. Signed-off-by: FUJITA Tomonori Reviewed-by: Benno Lossin Link: https://lore.kernel.org/r/20240501123548.51769-1-fujita.tomonori@gmail.com Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 32 ++++++++++++++++++++++++++++++++ rust/macros/module.rs | 18 ++++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 5214e07367c5..6dcc72aff111 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -56,6 +56,36 @@ use proc_macro::TokenStream; /// } /// ``` /// +/// ## Firmware +/// +/// The following example shows how to declare a kernel module that needs +/// to load binary firmware files. You need to specify the file names of +/// the firmware in the `firmware` field. The information is embedded +/// in the `modinfo` section of the kernel module. For example, a tool to +/// build an initramfs uses this information to put the firmware files into +/// the initramfs image. +/// +/// ```ignore +/// use kernel::prelude::*; +/// +/// module!{ +/// type: MyDeviceDriverModule, +/// name: "my_device_driver_module", +/// author: "Rust for Linux Contributors", +/// description: "My device driver requires firmware", +/// license: "GPL", +/// firmware: ["my_device_firmware1.bin", "my_device_firmware2.bin"], +/// } +/// +/// struct MyDeviceDriverModule; +/// +/// impl kernel::Module for MyDeviceDriverModule { +/// fn init() -> Result { +/// Ok(Self) +/// } +/// } +/// ``` +/// /// # Supported argument types /// - `type`: type which implements the [`Module`] trait (required). /// - `name`: ASCII string literal of the name of the kernel module (required). @@ -63,6 +93,8 @@ use proc_macro::TokenStream; /// - `description`: string literal of the description of the kernel module. /// - `license`: ASCII string literal of the license of the kernel module (required). /// - `alias`: array of ASCII string literals of the alias names of the kernel module. +/// - `firmware`: array of ASCII string literals of the firmware files of +/// the kernel module. #[proc_macro] pub fn module(ts: TokenStream) -> TokenStream { module::module(ts) diff --git a/rust/macros/module.rs b/rust/macros/module.rs index acd0393b5095..411dc103d82e 100644 --- a/rust/macros/module.rs +++ b/rust/macros/module.rs @@ -97,14 +97,22 @@ struct ModuleInfo { author: Option, description: Option, alias: Option>, + firmware: Option>, } impl ModuleInfo { fn parse(it: &mut token_stream::IntoIter) -> Self { let mut info = ModuleInfo::default(); - const EXPECTED_KEYS: &[&str] = - &["type", "name", "author", "description", "license", "alias"]; + const EXPECTED_KEYS: &[&str] = &[ + "type", + "name", + "author", + "description", + "license", + "alias", + "firmware", + ]; const REQUIRED_KEYS: &[&str] = &["type", "name", "license"]; let mut seen_keys = Vec::new(); @@ -131,6 +139,7 @@ impl ModuleInfo { "description" => info.description = Some(expect_string(it)), "license" => info.license = expect_string_ascii(it), "alias" => info.alias = Some(expect_string_array(it)), + "firmware" => info.firmware = Some(expect_string_array(it)), _ => panic!( "Unknown key \"{}\". Valid keys are: {:?}.", key, EXPECTED_KEYS @@ -186,6 +195,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream { modinfo.emit("alias", &alias); } } + if let Some(firmware) = info.firmware { + for fw in firmware { + modinfo.emit("firmware", &fw); + } + } // Built-in modules also export the `file` modinfo string. let file = From 526c539452cec6e7e65776d5807e6c66dd65d636 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 17 May 2024 19:06:15 +0200 Subject: [PATCH 020/120] docs: rust: introduce the new kernel.org LLVM+Rust toolchains These combined LLVM+Rust toolchains are now available, thanks to Nathan Chancellor (ClangBuiltLinux). Thus introduce them in the Rust Quick Start guide. Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240517170615.377786-1-ojeda@kernel.org Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index cc3f11e0d441..6fe69a601134 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -16,6 +16,13 @@ under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, at the time of writing, they are likely not to be recent enough unless the distribution tracks the latest releases. +Prebuilt stable versions of LLVM+Rust are provided on `kernel.org +`_. These are the same slim and fast +LLVM toolchains from :ref:`Getting LLVM ` with versions of Rust +added to them that Rust for Linux supports, depending on the Linux version. Two +sets are provided: the "latest LLVM" and "matching LLVM" (please see the link +for more information). + To easily check whether the requirements are met, the following target can be used:: From fe7d9d804337180d2377f0654537970c6cd863f7 Mon Sep 17 00:00:00 2001 From: Roland Xu Date: Thu, 23 May 2024 00:08:46 +0800 Subject: [PATCH 021/120] rust: kernel: make impl_has_work compatible with more generics Make the impl_has_work macro compatible with more complex generics such as lifetimes and const generic arguments. Signed-off-by: Roland Xu Link: https://lore.kernel.org/r/ME0P282MB4890A180B99490CC65EF64FDCCEB2@ME0P282MB4890.AUSP282.PROD.OUTLOOK.COM Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1077 [ Wrapped message to 72 columns. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/workqueue.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs index 1cec63a2aea8..553a5cba2adc 100644 --- a/rust/kernel/workqueue.rs +++ b/rust/kernel/workqueue.rs @@ -482,24 +482,26 @@ pub unsafe trait HasWork { /// use kernel::sync::Arc; /// use kernel::workqueue::{self, impl_has_work, Work}; /// -/// struct MyStruct { -/// work_field: Work, +/// struct MyStruct<'a, T, const N: usize> { +/// work_field: Work, 17>, +/// f: fn(&'a [T; N]), /// } /// /// impl_has_work! { -/// impl HasWork for MyStruct { self.work_field } +/// impl{'a, T, const N: usize} HasWork, 17> +/// for MyStruct<'a, T, N> { self.work_field } /// } /// ``` #[macro_export] macro_rules! impl_has_work { - ($(impl$(<$($implarg:ident),*>)? + ($(impl$({$($generics:tt)*})? HasWork<$work_type:ty $(, $id:tt)?> - for $self:ident $(<$($selfarg:ident),*>)? + for $self:ty { self.$field:ident } )*) => {$( // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right // type. - unsafe impl$(<$($implarg),*>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self $(<$($selfarg),*>)? { + unsafe impl$(<$($generics)+>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self { const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize; #[inline] @@ -515,7 +517,7 @@ macro_rules! impl_has_work { pub use impl_has_work; impl_has_work! { - impl HasWork for ClosureWork { self.work } + impl{T} HasWork for ClosureWork { self.work } } unsafe impl WorkItemPointer for Arc From 9ffc80c819739ab60c42223c46b7351cec6a0e97 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 28 May 2024 18:35:02 +0200 Subject: [PATCH 022/120] kbuild: rust: remove now-unneeded `rusttest` custom sysroot handling Since we dropped our custom `alloc` in commit 9d0441bab775 ("rust: alloc: remove our fork of the `alloc` crate"), there is no need anymore to keep the custom sysroot hack. Thus delete it, which makes the target way simpler and faster too. This also means we are not using Cargo for anything at the moment, and that no download is required anymore, so update the main `Makefile` and the documentation accordingly. Link: https://lore.kernel.org/r/20240528163502.411600-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 14 ------ Documentation/rust/testing.rst | 5 +-- Makefile | 3 +- rust/Makefile | 70 ++++-------------------------- 4 files changed, 11 insertions(+), 81 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 6fe69a601134..ac2f16288458 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -171,20 +171,6 @@ can be installed manually:: The standalone installers also come with ``clippy``. -cargo -***** - -``cargo`` is the Rust native build system. It is currently required to run -the tests since it is used to build a custom standard library that contains -the facilities provided by the custom ``alloc`` in the kernel. The tests can -be run using the ``rusttest`` Make target. - -If ``rustup`` is being used, all the profiles already install the tool, -thus nothing needs to be done. - -The standalone installers also come with ``cargo``. - - rustdoc ******* diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst index acfd0c2be48d..568b71b415a4 100644 --- a/Documentation/rust/testing.rst +++ b/Documentation/rust/testing.rst @@ -131,9 +131,8 @@ Additionally, there are the ``#[test]`` tests. These can be run using the make LLVM=1 rusttest -This requires the kernel ``.config`` and downloads external repositories. It -runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in -what these tests can test. +This requires the kernel ``.config``. It runs the ``#[test]`` tests on the host +(currently) and thus is fairly limited in what these tests can test. The Kselftests -------------- diff --git a/Makefile b/Makefile index b25b5b44af10..d4134a9f32b6 100644 --- a/Makefile +++ b/Makefile @@ -507,7 +507,6 @@ RUSTDOC = rustdoc RUSTFMT = rustfmt CLIPPY_DRIVER = clippy-driver BINDGEN = bindgen -CARGO = cargo PAHOLE = pahole RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex @@ -601,7 +600,7 @@ endif export RUSTC_BOOTSTRAP := 1 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG -export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO +export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN export HOSTRUSTC KBUILD_HOSTRUSTFLAGS export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX diff --git a/rust/Makefile b/rust/Makefile index f70d5e244fee..385378311322 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -44,17 +44,10 @@ rustc_sysroot := $(shell MAKEFLAGS= $(RUSTC) $(rust_flags) --print sysroot) rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2) RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library -ifeq ($(quiet),silent_) -cargo_quiet=-q +ifneq ($(quiet),) rust_test_quiet=-q rustdoc_test_quiet=--test-args -q rustdoc_test_kernel_quiet=>/dev/null -else ifeq ($(quiet),quiet_) -rust_test_quiet=-q -rustdoc_test_quiet=--test-args -q -rustdoc_test_kernel_quiet=>/dev/null -else -cargo_quiet=--verbose endif core-cfgs = \ @@ -135,22 +128,21 @@ quiet_cmd_rustc_test_library = RUSTC TL $< @$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \ --crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \ --out-dir $(objtree)/$(obj)/test --cfg testlib \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $< -rusttestlib-build_error: $(src)/build_error.rs rusttest-prepare FORCE +rusttestlib-build_error: $(src)/build_error.rs FORCE +$(call if_changed,rustc_test_library) rusttestlib-macros: private rustc_target_flags = --extern proc_macro rusttestlib-macros: private rustc_test_library_proc = yes -rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttestlib-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-bindings: $(src)/bindings/lib.rs rusttest-prepare FORCE +rusttestlib-bindings: $(src)/bindings/lib.rs FORCE +$(call if_changed,rustc_test_library) -rusttestlib-uapi: $(src)/uapi/lib.rs rusttest-prepare FORCE +rusttestlib-uapi: $(src)/uapi/lib.rs FORCE +$(call if_changed,rustc_test_library) quiet_cmd_rustdoc_test = RUSTDOC T $< @@ -159,7 +151,7 @@ quiet_cmd_rustdoc_test = RUSTDOC T $< $(RUSTDOC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) $(rustdoc_test_target_flags) \ - --sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \ + $(rustdoc_test_quiet) \ -L$(objtree)/$(obj)/test --output $(rustdoc_output) \ --crate-name $(subst rusttest-,,$@) $< @@ -192,7 +184,6 @@ quiet_cmd_rustc_test = RUSTC T $< $(RUSTC) --test $(rust_common_flags) \ @$(objtree)/include/generated/rustc_cfg \ $(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \ - --sysroot $(objtree)/$(obj)/test/sysroot \ -L$(objtree)/$(obj)/test \ --crate-name $(subst rusttest-,,$@) $<; \ $(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \ @@ -200,60 +191,15 @@ quiet_cmd_rustc_test = RUSTC T $< rusttest: rusttest-macros rusttest-kernel -# This prepares a custom sysroot with our custom `alloc` instead of -# the standard one. -# -# This requires several hacks: -# - Unlike `core` and `alloc`, `std` depends on more than a dozen crates, -# including third-party crates that need to be downloaded, plus custom -# `build.rs` steps. Thus hardcoding things here is not maintainable. -# - `cargo` knows how to build the standard library, but it is an unstable -# feature so far (`-Zbuild-std`). -# - `cargo` only considers the use case of building the standard library -# to use it in a given package. Thus we need to create a dummy package -# and pick the generated libraries from there. -# - The usual ways of modifying the dependency graph in `cargo` do not seem -# to apply for the `-Zbuild-std` steps, thus we have to mislead it -# by modifying the sources in the sysroot. -# - To avoid messing with the user's Rust installation, we create a clone -# of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std` -# steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag. -# -# In the future, we hope to avoid the whole ordeal by either: -# - Making the `test` crate not depend on `std` (either improving upstream -# or having our own custom crate). -# - Making the tests run in kernel space (requires the previous point). -# - Making `std` and friends be more like a "normal" crate, so that -# `-Zbuild-std` and related hacks are not needed. -quiet_cmd_rustsysroot = RUSTSYSROOT - cmd_rustsysroot = \ - rm -rf $(objtree)/$(obj)/test; \ - mkdir -p $(objtree)/$(obj)/test; \ - cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \ - echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \ - echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \ - >> $(objtree)/$(obj)/test/rustc_sysroot; \ - chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \ - $(CARGO) -q new $(objtree)/$(obj)/test/dummy; \ - RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \ - test -Zbuild-std --target $(rustc_host_target) \ - --manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \ - rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \ - cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \ - $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib - -rusttest-prepare: FORCE - +$(call if_changed,rustsysroot) - rusttest-macros: private rustc_target_flags = --extern proc_macro rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro -rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE +rusttest-macros: $(src)/macros/lib.rs FORCE +$(call if_changed,rustc_test) +$(call if_changed,rustdoc_test) rusttest-kernel: private rustc_target_flags = --extern alloc \ --extern build_error --extern macros --extern bindings --extern uapi -rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \ +rusttest-kernel: $(src)/kernel/lib.rs \ rusttestlib-build_error rusttestlib-macros rusttestlib-bindings \ rusttestlib-uapi FORCE +$(call if_changed,rustc_test) From d3ee24cce4e558041cbf21a5bc9cd9a4b1b3c7e7 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 29 May 2024 08:34:52 +0000 Subject: [PATCH 023/120] rust: alloc: fix typo in docs for GFP_NOWAIT Fix a typo in alloc.rs by replacing Ror with For. Signed-off-by: Alice Ryhl Reviewed-by: Vincenzo Palazzo Link: https://lore.kernel.org/r/20240529083452.779865-1-aliceryhl@google.com Signed-off-by: Miguel Ojeda --- rust/kernel/alloc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 531b5e471cb1..c44995914f75 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -66,7 +66,7 @@ pub mod flags { /// The same as [`GFP_KERNEL`], except the allocation is accounted to kmemcg. pub const GFP_KERNEL_ACCOUNT: Flags = Flags(bindings::GFP_KERNEL_ACCOUNT); - /// Ror kernel allocations that should not stall for direct reclaim, start physical IO or + /// For kernel allocations that should not stall for direct reclaim, start physical IO or /// use any filesystem callback. It is very likely to fail to allocate memory, even for very /// small allocations. pub const GFP_NOWAIT: Flags = Flags(bindings::GFP_NOWAIT); From ab44079e2869c9f1743d167d5f9b5befe375b6d9 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 7 Jun 2024 08:23:41 +0000 Subject: [PATCH 024/120] rust: alloc: add __GFP_HIGHMEM flag Make it possible to allocate memory that doesn't need to mapped into the kernel's address space. This flag is useful together with Page::alloc_page [1]. Rust Binder needs this for the memory that holds incoming transactions for each process. Each process will have a few megabytes of memory allocated with this flag, which is mapped into the process using vm_insert_page. When the kernel copies data for an incoming transaction into a process's memory region, it will use kmap_local_page to temporarily map pages that are being modified. There is no need for them to take up address space in the kernel when the kernel is not writing an incoming transaction into the page. Link: https://lore.kernel.org/all/20240528-alice-mm-v7-4-78222c31b8f4@google.com/ [1] Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240607-highmem-v1-1-d18c5ca4072f@google.com Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/kernel/alloc.rs | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index ddb5644d4fd9..52a1412338ef 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -25,3 +25,4 @@ const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT; const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO; +const gfp_t RUST_CONST_HELPER___GFP_HIGHMEM = ___GFP_HIGHMEM; diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index c44995914f75..396fe5a85a8f 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -52,6 +52,14 @@ pub mod flags { /// This is normally or'd with other flags. pub const __GFP_ZERO: Flags = Flags(bindings::__GFP_ZERO); + /// Allow the allocation to be in high memory. + /// + /// Allocations in high memory may not be mapped into the kernel's address space, so this can't + /// be used with `kmalloc` and other similar methods. + /// + /// This is normally or'd with other flags. + pub const __GFP_HIGHMEM: Flags = Flags(bindings::__GFP_HIGHMEM); + /// Users can not sleep and need the allocation to succeed. /// /// A lower watermark is applied to allow access to "atomic reserves". The current From b63c455d38be5f62a0665f3080c67334db5b4c41 Mon Sep 17 00:00:00 2001 From: Dirk Behme Date: Mon, 10 Jun 2024 14:23:32 +0200 Subject: [PATCH 025/120] docs: rust: no_std is used Using the #![no_std] attribute in the Rust kernel support is different to the default Rust usage. Mention this in the Documentation. Signed-off-by: Dirk Behme Reviewed-by: Trevor Gross Link: https://lore.kernel.org/r/20240610122332.3858571-1-dirk.behme@de.bosch.com [ Avoided breaking links in two lines. - Miguel ] Signed-off-by: Miguel Ojeda --- Documentation/rust/general-information.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst index 4bb6ac12d482..e3f388ef4ee4 100644 --- a/Documentation/rust/general-information.rst +++ b/Documentation/rust/general-information.rst @@ -7,6 +7,14 @@ This document contains useful information to know when working with the Rust support in the kernel. +``no_std`` +---------- + +The Rust support in the kernel can link only `core `_, +but not `std `_. Crates for use in the +kernel must opt into this behavior using the ``#![no_std]`` attribute. + + Code documentation ------------------ From 6dc9d9ca9a728c8b30976d3ca353563ba0bfb949 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 27 Jun 2024 17:43:55 -0700 Subject: [PATCH 026/120] kbuild: rust-analyzer: better error handling 1) Provide a better error message for the "Rust not available" case. Without this patch, one gets various misleading messages, such as: "No rule to make target 'rust-analyzer'" Instead, run scripts/rust_is_available.sh directly, as a prerequisite, and let that script report the cause of any problems, as well as providing a link to the documentation. Thanks to Miguel Ojeda for the idea of just letting rust_is_available.sh report its results directly. The new output in the failure case looks like this: $ make rust-analyzer *** *** Rust compiler 'rustc' could not be found. *** *** *** Please see Documentation/rust/quick-start.rst for details *** on how to set up the Rust support. *** make[1]: *** [/kernel_work/linux-github/Makefile:1975: rust-analyzer] Error 1 make: *** [Makefile:240: __sub-make] Error 2 Reviewed-by: Finn Behrens Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Signed-off-by: John Hubbard Link: https://lore.kernel.org/r/20240628004356.1384486-2-jhubbard@nvidia.com [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index d4134a9f32b6..de5be3845c70 100644 --- a/Makefile +++ b/Makefile @@ -1969,6 +1969,7 @@ tags TAGS cscope gtags: FORCE # IDE support targets PHONY += rust-analyzer rust-analyzer: + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh $(Q)$(MAKE) $(build)=rust $@ # Script to generate missing namespace dependencies From 5045b460843adf5bb31774b9464df5ae8e4da0d4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 27 Jun 2024 17:43:56 -0700 Subject: [PATCH 027/120] kbuild: rust-analyzer: improve comment documentation Replace the cryptic phrase ("IDE support targets") that initially appears to be about how to support old hard drives, with a few sentences that explain what "make rust-analyzer" provides. Signed-off-by: John Hubbard Reviewed-by: Finn Behrens Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20240628004356.1384486-3-jhubbard@nvidia.com [ Reworded title. - Miguel ] Signed-off-by: Miguel Ojeda --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index de5be3845c70..fea263aaa492 100644 --- a/Makefile +++ b/Makefile @@ -1966,7 +1966,9 @@ quiet_cmd_tags = GEN $@ tags TAGS cscope gtags: FORCE $(call cmd,tags) -# IDE support targets +# Generate rust-project.json (a file that describes the structure of non-Cargo +# Rust projects) for rust-analyzer (an implementation of the Language Server +# Protocol). PHONY += rust-analyzer rust-analyzer: $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh From 1b580e7b9ba2e5939c4b94da2cb4888605b39955 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Tue, 28 May 2024 14:58:02 +0000 Subject: [PATCH 028/120] rust: uaccess: add userspace pointers A pointer to an area in userspace memory, which can be either read-only or read-write. All methods on this struct are safe: attempting to read or write on bad addresses (either out of the bound of the slice or unmapped addresses) will return `EFAULT`. Concurrent access, *including data races to/from userspace memory*, is permitted, because fundamentally another userspace thread/process could always be modifying memory at the same time (in the same way that userspace Rust's `std::io` permits data races with the contents of files on disk). In the presence of a race, the exact byte values read/written are unspecified but the operation is well-defined. Kernelspace code should validate its copy of data after completing a read, and not expect that multiple reads of the same address will return the same value. These APIs are designed to make it difficult to accidentally write TOCTOU bugs. Every time you read from a memory location, the pointer is advanced by the length so that you cannot use that reader to read the same memory location twice. Preventing double-fetches avoids TOCTOU bugs. This is accomplished by taking `self` by value to prevent obtaining multiple readers on a given `UserSlice`, and the readers only permitting forward reads. If double-fetching a memory location is necessary for some reason, then that is done by creating multiple readers to the same memory location. Constructing a `UserSlice` performs no checks on the provided address and length, it can safely be constructed inside a kernel thread with no current userspace process. Reads and writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map of the current process and enforce that the address range is within the user range (no additional calls to `access_ok` are needed). This code is based on something that was originally written by Wedson on the old rust branch. It was modified by Alice by removing the `IoBufferReader` and `IoBufferWriter` traits, and various other changes. Signed-off-by: Wedson Almeida Filho Reviewed-by: Benno Lossin Reviewed-by: Trevor Gross Reviewed-by: Boqun Feng Co-developed-by: Alice Ryhl Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-1-78222c31b8f4@google.com [ Wrapped docs to 100 and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/helpers.c | 14 ++ rust/kernel/lib.rs | 1 + rust/kernel/uaccess.rs | 313 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 328 insertions(+) create mode 100644 rust/kernel/uaccess.rs diff --git a/rust/helpers.c b/rust/helpers.c index 2c37a0f5d7a8..d6abe4bd45d9 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -39,6 +39,20 @@ __noreturn void rust_helper_BUG(void) } EXPORT_SYMBOL_GPL(rust_helper_BUG); +unsigned long rust_helper_copy_from_user(void *to, const void __user *from, + unsigned long n) +{ + return copy_from_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_from_user); + +unsigned long rust_helper_copy_to_user(void __user *to, const void *from, + unsigned long n) +{ + return copy_to_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_to_user); + void rust_helper_mutex_lock(struct mutex *lock) { mutex_lock(lock); diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index fbd91a48ff8b..767026db068e 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -45,6 +45,7 @@ pub mod sync; pub mod task; pub mod time; pub mod types; +pub mod uaccess; pub mod workqueue; #[doc(hidden)] diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs new file mode 100644 index 000000000000..55f0d7ecfa3f --- /dev/null +++ b/rust/kernel/uaccess.rs @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Slices to user space memory regions. +//! +//! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h) + +use crate::{alloc::Flags, bindings, error::Result, prelude::*}; +use alloc::vec::Vec; +use core::ffi::{c_ulong, c_void}; +use core::mem::MaybeUninit; + +/// The type used for userspace addresses. +pub type UserPtr = usize; + +/// A pointer to an area in userspace memory, which can be either read-only or read-write. +/// +/// All methods on this struct are safe: attempting to read or write on bad addresses (either out of +/// the bound of the slice or unmapped addresses) will return [`EFAULT`]. Concurrent access, +/// *including data races to/from userspace memory*, is permitted, because fundamentally another +/// userspace thread/process could always be modifying memory at the same time (in the same way that +/// userspace Rust's [`std::io`] permits data races with the contents of files on disk). In the +/// presence of a race, the exact byte values read/written are unspecified but the operation is +/// well-defined. Kernelspace code should validate its copy of data after completing a read, and not +/// expect that multiple reads of the same address will return the same value. +/// +/// These APIs are designed to make it difficult to accidentally write TOCTOU (time-of-check to +/// time-of-use) bugs. Every time a memory location is read, the reader's position is advanced by +/// the read length and the next read will start from there. This helps prevent accidentally reading +/// the same location twice and causing a TOCTOU bug. +/// +/// Creating a [`UserSliceReader`] and/or [`UserSliceWriter`] consumes the `UserSlice`, helping +/// ensure that there aren't multiple readers or writers to the same location. +/// +/// If double-fetching a memory location is necessary for some reason, then that is done by creating +/// multiple readers to the same memory location, e.g. using [`clone_reader`]. +/// +/// # Examples +/// +/// Takes a region of userspace memory from the current process, and modify it by adding one to +/// every byte in the region. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::Result; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// fn bytes_add_one(uptr: UserPtr, len: usize) -> Result<()> { +/// let (read, mut write) = UserSlice::new(uptr, len).reader_writer(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// for b in &mut buf { +/// *b = b.wrapping_add(1); +/// } +/// +/// write.write_slice(&buf)?; +/// Ok(()) +/// } +/// ``` +/// +/// Example illustrating a TOCTOU (time-of-check to time-of-use) bug. +/// +/// ```no_run +/// use alloc::vec::Vec; +/// use core::ffi::c_void; +/// use kernel::error::{code::EINVAL, Result}; +/// use kernel::uaccess::{UserPtr, UserSlice}; +/// +/// /// Returns whether the data in this region is valid. +/// fn is_valid(uptr: UserPtr, len: usize) -> Result { +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// todo!() +/// } +/// +/// /// Returns the bytes behind this user pointer if they are valid. +/// fn get_bytes_if_valid(uptr: UserPtr, len: usize) -> Result> { +/// if !is_valid(uptr, len)? { +/// return Err(EINVAL); +/// } +/// +/// let read = UserSlice::new(uptr, len).reader(); +/// +/// let mut buf = Vec::new(); +/// read.read_all(&mut buf, GFP_KERNEL)?; +/// +/// // THIS IS A BUG! The bytes could have changed since we checked them. +/// // +/// // To avoid this kind of bug, don't call `UserSlice::new` multiple +/// // times with the same address. +/// Ok(buf) +/// } +/// ``` +/// +/// [`std::io`]: https://doc.rust-lang.org/std/io/index.html +/// [`clone_reader`]: UserSliceReader::clone_reader +pub struct UserSlice { + ptr: UserPtr, + length: usize, +} + +impl UserSlice { + /// Constructs a user slice from a raw pointer and a length in bytes. + /// + /// Constructing a [`UserSlice`] performs no checks on the provided address and length, it can + /// safely be constructed inside a kernel thread with no current userspace process. Reads and + /// writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map + /// of the current process and enforce that the address range is within the user range (no + /// additional calls to `access_ok` are needed). Validity of the pointer is checked when you + /// attempt to read or write, not in the call to `UserSlice::new`. + /// + /// Callers must be careful to avoid time-of-check-time-of-use (TOCTOU) issues. The simplest way + /// is to create a single instance of [`UserSlice`] per user memory block as it reads each byte + /// at most once. + pub fn new(ptr: UserPtr, length: usize) -> Self { + UserSlice { ptr, length } + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(self, buf: &mut Vec, flags: Flags) -> Result { + self.reader().read_all(buf, flags) + } + + /// Constructs a [`UserSliceReader`]. + pub fn reader(self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs a [`UserSliceWriter`]. + pub fn writer(self) -> UserSliceWriter { + UserSliceWriter { + ptr: self.ptr, + length: self.length, + } + } + + /// Constructs both a [`UserSliceReader`] and a [`UserSliceWriter`]. + /// + /// Usually when this is used, you will first read the data, and then overwrite it afterwards. + pub fn reader_writer(self) -> (UserSliceReader, UserSliceWriter) { + ( + UserSliceReader { + ptr: self.ptr, + length: self.length, + }, + UserSliceWriter { + ptr: self.ptr, + length: self.length, + }, + ) + } +} + +/// A reader for [`UserSlice`]. +/// +/// Used to incrementally read from the user slice. +pub struct UserSliceReader { + ptr: UserPtr, + length: usize, +} + +impl UserSliceReader { + /// Skip the provided number of bytes. + /// + /// Returns an error if skipping more than the length of the buffer. + pub fn skip(&mut self, num_skip: usize) -> Result { + // Update `self.length` first since that's the fallible part of this operation. + self.length = self.length.checked_sub(num_skip).ok_or(EFAULT)?; + self.ptr = self.ptr.wrapping_add(num_skip); + Ok(()) + } + + /// Create a reader that can access the same range of data. + /// + /// Reading from the clone does not advance the current reader. + /// + /// The caller should take care to not introduce TOCTOU issues, as described in the + /// documentation for [`UserSlice`]. + pub fn clone_reader(&self) -> UserSliceReader { + UserSliceReader { + ptr: self.ptr, + length: self.length, + } + } + + /// Returns the number of bytes left to be read from this reader. + /// + /// Note that even reading less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no data is available in the io buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// For a version that uses `&mut [u8]`, please see [`UserSliceReader::read_slice`]. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + /// + /// # Guarantees + /// + /// After a successful call to this method, all bytes in `out` are initialized. + pub fn read_raw(&mut self, out: &mut [MaybeUninit]) -> Result { + let len = out.len(); + let out_ptr = out.as_mut_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `out_ptr` points into a mutable slice of length `len_ulong`, so we may write + // that many bytes to it. + let res = + unsafe { bindings::copy_from_user(out_ptr, self.ptr as *const c_void, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } + + /// Reads raw data from the user slice into a kernel buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error. + pub fn read_slice(&mut self, out: &mut [u8]) -> Result { + // SAFETY: The types are compatible and `read_raw` doesn't write uninitialized bytes to + // `out`. + let out = unsafe { &mut *(out as *mut [u8] as *mut [MaybeUninit]) }; + self.read_raw(out) + } + + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address. + pub fn read_all(mut self, buf: &mut Vec, flags: Flags) -> Result { + let len = self.length; + VecExt::::reserve(buf, len, flags)?; + + // The call to `try_reserve` was successful, so the spare capacity is at least `len` bytes + // long. + self.read_raw(&mut buf.spare_capacity_mut()[..len])?; + + // SAFETY: Since the call to `read_raw` was successful, so the next `len` bytes of the + // vector have been initialized. + unsafe { buf.set_len(buf.len() + len) }; + Ok(()) + } +} + +/// A writer for [`UserSlice`]. +/// +/// Used to incrementally write into the user slice. +pub struct UserSliceWriter { + ptr: UserPtr, + length: usize, +} + +impl UserSliceWriter { + /// Returns the amount of space remaining in this buffer. + /// + /// Note that even writing less than this number of bytes may fail. + pub fn len(&self) -> usize { + self.length + } + + /// Returns `true` if no more data can be written to this buffer. + pub fn is_empty(&self) -> bool { + self.length == 0 + } + + /// Writes raw data to this user pointer from a kernel buffer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write_slice(&mut self, data: &[u8]) -> Result { + let len = data.len(); + let data_ptr = data.as_ptr().cast::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: `data_ptr` points into an immutable slice of length `len_ulong`, so we may read + // that many bytes from it. + let res = unsafe { bindings::copy_to_user(self.ptr as *mut c_void, data_ptr, len_ulong) }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } +} From 1f9a8286bc0c3df7d789ea625d9d9db3d7779f2d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:58:03 +0000 Subject: [PATCH 029/120] uaccess: always export _copy_[from|to]_user with CONFIG_RUST Rust code needs to be able to access _copy_from_user and _copy_to_user so that it can skip the check_copy_size check in cases where the length is known at compile-time, mirroring the logic for when C code will skip check_copy_size. To do this, we ensure that exported versions of these methods are available when CONFIG_RUST is enabled. Alice has verified that this patch passes the CONFIG_TEST_USER_COPY test on x86 using the Android cuttlefish emulator. Signed-off-by: Arnd Bergmann Tested-by: Alice Ryhl Reviewed-by: Boqun Feng Reviewed-by: Kees Cook Signed-off-by: Alice Ryhl Acked-by: Andrew Morton Link: https://lore.kernel.org/r/20240528-alice-mm-v7-2-78222c31b8f4@google.com Signed-off-by: Miguel Ojeda --- include/linux/uaccess.h | 46 ++++++++++++++++++++++++++++------------- lib/usercopy.c | 30 ++++----------------------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..d8e4105a2f21 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) return raw_copy_to_user(to, from, n); } -#ifdef INLINE_COPY_FROM_USER +/* + * Architectures that #define INLINE_COPY_TO_USER use this function + * directly in the normal copy_to/from_user(), the other ones go + * through an extern _copy_to/from_user(), which expands the same code + * here. + * + * Rust code always uses the extern definition. + */ static inline __must_check unsigned long -_copy_from_user(void *to, const void __user *from, unsigned long n) +_inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { + /* + * Ensure that bad access_ok() speculation will not + * lead to nasty side effects *after* the copy is + * finished: + */ + barrier_nospec(); instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); @@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) memset(to + (n - res), 0, res); return res; } -#else extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); -#endif -#ifdef INLINE_COPY_TO_USER static inline __must_check unsigned long -_copy_to_user(void __user *to, const void *from, unsigned long n) +_inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) @@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } -#else extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); -#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { - if (check_copy_size(to, n, false)) - n = _copy_from_user(to, from, n); - return n; + if (!check_copy_size(to, n, false)) + return n; +#ifdef INLINE_COPY_FROM_USER + return _inline_copy_from_user(to, from, n); +#else + return _copy_from_user(to, from, n); +#endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { - if (check_copy_size(from, n, true)) - n = _copy_to_user(to, from, n); - return n; + if (!check_copy_size(from, n, true)) + return n; + +#ifdef INLINE_COPY_TO_USER + return _inline_copy_to_user(to, from, n); +#else + return _copy_to_user(to, from, n); +#endif } #ifndef copy_mc_to_kernel diff --git a/lib/usercopy.c b/lib/usercopy.c index 499a7a7d54db..7b17b83c8042 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,40 +12,18 @@ /* out-of-line parts */ -#ifndef INLINE_COPY_FROM_USER +#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { - unsigned long res = n; - might_fault(); - if (!should_fail_usercopy() && likely(access_ok(from, n))) { - /* - * Ensure that bad access_ok() speculation will not - * lead to nasty side effects *after* the copy is - * finished: - */ - barrier_nospec(); - instrument_copy_from_user_before(to, from, n); - res = raw_copy_from_user(to, from, n); - instrument_copy_from_user_after(to, from, n, res); - } - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; + return _inline_copy_from_user(to, from, n); } EXPORT_SYMBOL(_copy_from_user); #endif -#ifndef INLINE_COPY_TO_USER +#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { - might_fault(); - if (should_fail_usercopy()) - return n; - if (likely(access_ok(to, n))) { - instrument_copy_to_user(to, from, n); - n = raw_copy_to_user(to, from, n); - } - return n; + return _inline_copy_to_user(to, from, n); } EXPORT_SYMBOL(_copy_to_user); #endif From b33bf37adbb2ae35881e7fdd997ce3334d71b129 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 28 May 2024 14:58:04 +0000 Subject: [PATCH 030/120] rust: uaccess: add typed accessors for userspace pointers Add safe methods for reading and writing Rust values to and from userspace pointers. The C methods for copying to/from userspace use a function called `check_object_size` to verify that the kernel pointer is not dangling. However, this check is skipped when the length is a compile-time constant, with the assumption that such cases trivially have a correct kernel pointer. In this patch, we apply the same optimization to the typed accessors. For both methods, the size of the operation is known at compile time to be size_of of the type being read or written. Since the C side doesn't provide a variant that skips only this check, we create custom helpers for this purpose. The majority of reads and writes to userspace pointers in the Rust Binder driver uses these accessor methods. Benchmarking has found that skipping the `check_object_size` check makes a big difference for the cases being skipped here. (And that the check doesn't make a difference for the cases that use the raw read/write methods.) This code is based on something that was originally written by Wedson on the old rust branch. It was modified by Alice to skip the `check_object_size` check, and to update various comments, including the notes about kernel pointers in `WritableToBytes`. Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Reviewed-by: Benno Lossin Reviewed-by: Boqun Feng Reviewed-by: Trevor Gross Reviewed-by: Gary Guo Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-3-78222c31b8f4@google.com [ Wrapped docs to 100 and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/kernel/types.rs | 64 ++++++++++++++++++++++++++++++++++ rust/kernel/uaccess.rs | 79 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs index 2e7c9008621f..bd189d646adb 100644 --- a/rust/kernel/types.rs +++ b/rust/kernel/types.rs @@ -409,3 +409,67 @@ pub enum Either { /// Constructs an instance of [`Either`] containing a value of type `R`. Right(R), } + +/// Types for which any bit pattern is valid. +/// +/// Not all types are valid for all values. For example, a `bool` must be either zero or one, so +/// reading arbitrary bytes into something that contains a `bool` is not okay. +/// +/// It's okay for the type to have padding, as initializing those bytes has no effect. +/// +/// # Safety +/// +/// All bit-patterns must be valid for this type. This type must not have interior mutability. +pub unsafe trait FromBytes {} + +// SAFETY: All bit patterns are acceptable values of the types below. +unsafe impl FromBytes for u8 {} +unsafe impl FromBytes for u16 {} +unsafe impl FromBytes for u32 {} +unsafe impl FromBytes for u64 {} +unsafe impl FromBytes for usize {} +unsafe impl FromBytes for i8 {} +unsafe impl FromBytes for i16 {} +unsafe impl FromBytes for i32 {} +unsafe impl FromBytes for i64 {} +unsafe impl FromBytes for isize {} +// SAFETY: If all bit patterns are acceptable for individual values in an array, then all bit +// patterns are also acceptable for arrays of that type. +unsafe impl FromBytes for [T] {} +unsafe impl FromBytes for [T; N] {} + +/// Types that can be viewed as an immutable slice of initialized bytes. +/// +/// If a struct implements this trait, then it is okay to copy it byte-for-byte to userspace. This +/// means that it should not have any padding, as padding bytes are uninitialized. Reading +/// uninitialized memory is not just undefined behavior, it may even lead to leaking sensitive +/// information on the stack to userspace. +/// +/// The struct should also not hold kernel pointers, as kernel pointer addresses are also considered +/// sensitive. However, leaking kernel pointers is not considered undefined behavior by Rust, so +/// this is a correctness requirement, but not a safety requirement. +/// +/// # Safety +/// +/// Values of this type may not contain any uninitialized bytes. This type must not have interior +/// mutability. +pub unsafe trait AsBytes {} + +// SAFETY: Instances of the following types have no uninitialized portions. +unsafe impl AsBytes for u8 {} +unsafe impl AsBytes for u16 {} +unsafe impl AsBytes for u32 {} +unsafe impl AsBytes for u64 {} +unsafe impl AsBytes for usize {} +unsafe impl AsBytes for i8 {} +unsafe impl AsBytes for i16 {} +unsafe impl AsBytes for i32 {} +unsafe impl AsBytes for i64 {} +unsafe impl AsBytes for isize {} +unsafe impl AsBytes for bool {} +unsafe impl AsBytes for char {} +unsafe impl AsBytes for str {} +// SAFETY: If individual values in an array have no uninitialized portions, then the array itself +// does not have any uninitialized portions either. +unsafe impl AsBytes for [T] {} +unsafe impl AsBytes for [T; N] {} diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs index 55f0d7ecfa3f..e9347cff99ab 100644 --- a/rust/kernel/uaccess.rs +++ b/rust/kernel/uaccess.rs @@ -4,10 +4,16 @@ //! //! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h) -use crate::{alloc::Flags, bindings, error::Result, prelude::*}; +use crate::{ + alloc::Flags, + bindings, + error::Result, + prelude::*, + types::{AsBytes, FromBytes}, +}; use alloc::vec::Vec; use core::ffi::{c_ulong, c_void}; -use core::mem::MaybeUninit; +use core::mem::{size_of, MaybeUninit}; /// The type used for userspace addresses. pub type UserPtr = usize; @@ -247,6 +253,41 @@ impl UserSliceReader { self.read_raw(out) } + /// Reads a value of the specified type. + /// + /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of + /// bounds of this [`UserSliceReader`]. + pub fn read(&mut self) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + let mut out: MaybeUninit = MaybeUninit::uninit(); + // SAFETY: The local variable `out` is valid for writing `size_of::()` bytes. + // + // By using the _copy_from_user variant, we skip the check_object_size check that verifies + // the kernel pointer. This mirrors the logic on the C side that skips the check when the + // length is a compile-time constant. + let res = unsafe { + bindings::_copy_from_user( + out.as_mut_ptr().cast::(), + self.ptr as *const c_void, + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + // SAFETY: The read above has initialized all bytes in `out`, and since `T` implements + // `FromBytes`, any bit-pattern is a valid value for this type. + Ok(unsafe { out.assume_init() }) + } + /// Reads the entirety of the user slice, appending it to the end of the provided buffer. /// /// Fails with [`EFAULT`] if the read happens on a bad address. @@ -310,4 +351,38 @@ impl UserSliceWriter { self.length -= len; Ok(()) } + + /// Writes the provided Rust value to this userspace pointer. + /// + /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of + /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even + /// if it returns an error. + pub fn write(&mut self, value: &T) -> Result { + let len = size_of::(); + if len > self.length { + return Err(EFAULT); + } + let Ok(len_ulong) = c_ulong::try_from(len) else { + return Err(EFAULT); + }; + // SAFETY: The reference points to a value of type `T`, so it is valid for reading + // `size_of::()` bytes. + // + // By using the _copy_to_user variant, we skip the check_object_size check that verifies the + // kernel pointer. This mirrors the logic on the C side that skips the check when the length + // is a compile-time constant. + let res = unsafe { + bindings::_copy_to_user( + self.ptr as *mut c_void, + (value as *const T).cast::(), + len_ulong, + ) + }; + if res != 0 { + return Err(EFAULT); + } + self.ptr = self.ptr.wrapping_add(len); + self.length -= len; + Ok(()) + } } From fc6e66f4696b63b8a2645a2bcea407cb04bd0666 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 28 May 2024 14:58:05 +0000 Subject: [PATCH 031/120] rust: add abstraction for `struct page` Adds a new struct called `Page` that wraps a pointer to `struct page`. This struct is assumed to hold ownership over the page, so that Rust code can allocate and manage pages directly. The page type has various methods for reading and writing into the page. These methods will temporarily map the page to allow the operation. All of these methods use a helper that takes an offset and length, performs bounds checks, and returns a pointer to the given offset in the page. This patch only adds support for pages of order zero, as that is all Rust Binder needs. However, it is written to make it easy to add support for higher-order pages in the future. To do that, you would add a const generic parameter to `Page` that specifies the order. Most of the methods do not need to be adjusted, as the logic for dealing with mapping multiple pages at once can be isolated to just the `with_pointer_into_page` method. Rust Binder needs to manage pages directly as that is how transactions are delivered: Each process has an mmap'd region for incoming transactions. When an incoming transaction arrives, the Binder driver will choose a region in the mmap, allocate and map the relevant pages manually, and copy the incoming transaction directly into the page. This architecture allows the driver to copy transactions directly from the address space of one process to another, without an intermediate copy to a kernel buffer. This code is based on Wedson's page abstractions from the old rust branch, but it has been modified by Alice by removing the incomplete support for higher-order pages, by introducing the `with_*` helpers to consolidate the bounds checking logic into a single place, and various other changes. Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Reviewed-by: Andreas Hindborg Reviewed-by: Trevor Gross Reviewed-by: Benno Lossin Reviewed-by: Boqun Feng Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20240528-alice-mm-v7-4-78222c31b8f4@google.com [ Fixed typos and added a few intra-doc links. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/bindings/bindings_helper.h | 1 + rust/helpers.c | 20 +++ rust/kernel/alloc.rs | 7 + rust/kernel/lib.rs | 1 + rust/kernel/page.rs | 250 ++++++++++++++++++++++++++++++++ 5 files changed, 279 insertions(+) create mode 100644 rust/kernel/page.rs diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 52a1412338ef..f2bafb10f181 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -20,6 +20,7 @@ /* `bindgen` gets confused at certain things. */ const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN; +const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE; const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC; const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL; const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT; diff --git a/rust/helpers.c b/rust/helpers.c index d6abe4bd45d9..305f0577fae9 100644 --- a/rust/helpers.c +++ b/rust/helpers.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -94,6 +96,24 @@ int rust_helper_signal_pending(struct task_struct *t) } EXPORT_SYMBOL_GPL(rust_helper_signal_pending); +struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL_GPL(rust_helper_alloc_pages); + +void *rust_helper_kmap_local_page(struct page *page) +{ + return kmap_local_page(page); +} +EXPORT_SYMBOL_GPL(rust_helper_kmap_local_page); + +void rust_helper_kunmap_local(const void *addr) +{ + kunmap_local(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_kunmap_local); + refcount_t rust_helper_REFCOUNT_INIT(int n) { return (refcount_t)REFCOUNT_INIT(n); diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index 396fe5a85a8f..1966bd407017 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -20,6 +20,13 @@ pub struct AllocError; #[derive(Clone, Copy)] pub struct Flags(u32); +impl Flags { + /// Get the raw representation of this flag. + pub(crate) fn as_raw(self) -> u32 { + self.0 + } +} + impl core::ops::BitOr for Flags { type Output = Self; fn bitor(self, rhs: Self) -> Self::Output { diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 767026db068e..5d310e79485f 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -35,6 +35,7 @@ pub mod ioctl; pub mod kunit; #[cfg(CONFIG_NET)] pub mod net; +pub mod page; pub mod prelude; pub mod print; mod static_assert; diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs new file mode 100644 index 000000000000..208a006d587c --- /dev/null +++ b/rust/kernel/page.rs @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel page allocation and management. + +use crate::{ + alloc::{AllocError, Flags}, + bindings, + error::code::*, + error::Result, + uaccess::UserSliceReader, +}; +use core::ptr::{self, NonNull}; + +/// A bitwise shift for the page size. +pub const PAGE_SHIFT: usize = bindings::PAGE_SHIFT as usize; + +/// The number of bytes in a page. +pub const PAGE_SIZE: usize = bindings::PAGE_SIZE; + +/// A bitmask that gives the page containing a given address. +pub const PAGE_MASK: usize = !(PAGE_SIZE - 1); + +/// A pointer to a page that owns the page allocation. +/// +/// # Invariants +/// +/// The pointer is valid, and has ownership over the page. +pub struct Page { + page: NonNull, +} + +// SAFETY: Pages have no logic that relies on them staying on a given thread, so moving them across +// threads is safe. +unsafe impl Send for Page {} + +// SAFETY: Pages have no logic that relies on them not being accessed concurrently, so accessing +// them concurrently is safe. +unsafe impl Sync for Page {} + +impl Page { + /// Allocates a new page. + /// + /// # Examples + /// + /// Allocate memory for a page. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL)?; + /// # Ok(()) } + /// ``` + /// + /// Allocate memory for a page and zero its contents. + /// + /// ``` + /// use kernel::page::Page; + /// + /// # fn dox() -> Result<(), kernel::alloc::AllocError> { + /// let page = Page::alloc_page(GFP_KERNEL | __GFP_ZERO)?; + /// # Ok(()) } + /// ``` + pub fn alloc_page(flags: Flags) -> Result { + // SAFETY: Depending on the value of `gfp_flags`, this call may sleep. Other than that, it + // is always safe to call this method. + let page = unsafe { bindings::alloc_pages(flags.as_raw(), 0) }; + let page = NonNull::new(page).ok_or(AllocError)?; + // INVARIANT: We just successfully allocated a page, so we now have ownership of the newly + // allocated page. We transfer that ownership to the new `Page` object. + Ok(Self { page }) + } + + /// Returns a raw pointer to the page. + pub fn as_ptr(&self) -> *mut bindings::page { + self.page.as_ptr() + } + + /// Runs a piece of code with this page mapped to an address. + /// + /// The page is unmapped when this call returns. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `PAGE_SIZE` bytes and for the duration in which the closure is called. The pointer might + /// only be mapped on the current thread, and when that is the case, dereferencing it on other + /// threads is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't + /// cause data races, the memory may be uninitialized, and so on. + /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_page_mapped(&self, f: impl FnOnce(*mut u8) -> T) -> T { + // SAFETY: `page` is valid due to the type invariants on `Page`. + let mapped_addr = unsafe { bindings::kmap_local_page(self.as_ptr()) }; + + let res = f(mapped_addr.cast()); + + // This unmaps the page mapped above. + // + // SAFETY: Since this API takes the user code as a closure, it can only be used in a manner + // where the pages are unmapped in reverse order. This is as required by `kunmap_local`. + // + // In other words, if this call to `kunmap_local` happens when a different page should be + // unmapped first, then there must necessarily be a call to `kmap_local_page` other than the + // call just above in `with_page_mapped` that made that possible. In this case, it is the + // unsafe block that wraps that other call that is incorrect. + unsafe { bindings::kunmap_local(mapped_addr) }; + + res + } + + /// Runs a piece of code with a raw pointer to a slice of this page, with bounds checking. + /// + /// If `f` is called, then it will be called with a pointer that points at `off` bytes into the + /// page, and the pointer will be valid for at least `len` bytes. The pointer is only valid on + /// this task, as this method uses a local mapping. + /// + /// If `off` and `len` refers to a region outside of this page, then this method returns + /// [`EINVAL`] and does not call `f`. + /// + /// # Using the raw pointer + /// + /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for + /// `len` bytes and for the duration in which the closure is called. The pointer might only be + /// mapped on the current thread, and when that is the case, dereferencing it on other threads + /// is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't cause + /// data races, the memory may be uninitialized, and so on. + /// + /// If multiple threads map the same page at the same time, then they may reference with + /// different addresses. However, even if the addresses are different, the underlying memory is + /// still the same for these purposes (e.g., it's still a data race if they both write to the + /// same underlying byte at the same time). + fn with_pointer_into_page( + &self, + off: usize, + len: usize, + f: impl FnOnce(*mut u8) -> Result, + ) -> Result { + let bounds_ok = off <= PAGE_SIZE && len <= PAGE_SIZE && (off + len) <= PAGE_SIZE; + + if bounds_ok { + self.with_page_mapped(move |page_addr| { + // SAFETY: The `off` integer is at most `PAGE_SIZE`, so this pointer offset will + // result in a pointer that is in bounds or one off the end of the page. + f(unsafe { page_addr.add(off) }) + }) + } else { + Err(EINVAL) + } + } + + /// Maps the page and reads from it into the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `dst` is valid for writing `len` bytes. + /// * Callers must ensure that this call does not race with a write to the same page that + /// overlaps with this read. + pub unsafe fn read_raw(&self, dst: *mut u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |src| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then + // it has performed a bounds check and guarantees that `src` is + // valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and writes into it from the given buffer. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// * Callers must ensure that `src` is valid for reading `len` bytes. + /// * Callers must ensure that this call does not race with a read or write to the same page + /// that overlaps with this write. + pub unsafe fn write_raw(&self, src: *const u8, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::copy_nonoverlapping(src, dst, len) }; + Ok(()) + }) + } + + /// Maps the page and zeroes the given slice. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn fill_zero_raw(&self, offset: usize, len: usize) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. + // + // There caller guarantees that there is no data race. + unsafe { ptr::write_bytes(dst, 0u8, len) }; + Ok(()) + }) + } + + /// Copies data from userspace into this page. + /// + /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes + /// outside of the page, then this call returns [`EINVAL`]. + /// + /// Like the other `UserSliceReader` methods, data races are allowed on the userspace address. + /// However, they are not allowed on the page you are copying into. + /// + /// # Safety + /// + /// Callers must ensure that this call does not race with a read or write to the same page that + /// overlaps with this write. + pub unsafe fn copy_from_user_slice_raw( + &self, + reader: &mut UserSliceReader, + offset: usize, + len: usize, + ) -> Result { + self.with_pointer_into_page(offset, len, move |dst| { + // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a + // bounds check and guarantees that `dst` is valid for `len` bytes. Furthermore, we have + // exclusive access to the slice since the caller guarantees that there are no races. + reader.read_raw(unsafe { core::slice::from_raw_parts_mut(dst.cast(), len) }) + }) + } +} + +impl Drop for Page { + fn drop(&mut self) { + // SAFETY: By the type invariants, we have ownership of the page and can free it. + unsafe { bindings::__free_pages(self.page.as_ptr(), 0) }; + } +} From 86588139b84386a47108b024d703c3c24ddbbec8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 3 Jul 2024 18:01:35 -0700 Subject: [PATCH 032/120] Documentation: CXL Maturity Map Provide a survey of the work-in-progress maturity (implementation status) of various aspects of the CXL subsystem. Clarify that in addition to ongoing upkeep relative to specification updates, there are some long running themes in the driver that respond to the discovery of new corner cases (bugs) and new use cases (feature extensions). The primary audience is distribution maintainers, but it also serves as a guide for kernel developers to understand what aspects of the CXL subsystem need more help. It is a landing page to document ongoing progress, and a guide to discern exposure to work-in-progress features. Reviewed-by: Adam Manzanares Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Reviewed-by: Alison Schofield Link: https://patch.msgid.link/172005486862.2048248.6668794717827294862.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/index.rst | 2 + Documentation/driver-api/cxl/maturity-map.rst | 202 ++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 205 insertions(+) create mode 100644 Documentation/driver-api/cxl/maturity-map.rst diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst index 036e49553542..12b82725d322 100644 --- a/Documentation/driver-api/cxl/index.rst +++ b/Documentation/driver-api/cxl/index.rst @@ -9,4 +9,6 @@ Compute Express Link memory-devices + maturity-map + .. only:: subproject and html diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst new file mode 100644 index 000000000000..df8e2ac2a320 --- /dev/null +++ b/Documentation/driver-api/cxl/maturity-map.rst @@ -0,0 +1,202 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +=========================================== +Compute Express Link Subsystem Maturity Map +=========================================== + +The Linux CXL subsystem tracks the dynamic `CXL specification +`_ that +continues to respond to new use cases with new features, capability +updates and fixes. At any given point some aspects of the subsystem are +more mature than others. While the periodic pull requests summarize the +`work being incorporated each merge window +`_, +those do not always convey progress relative to a starting point and a +future end goal. + +What follows is a coarse breakdown of the subsystem's major +responsibilities along with a maturity score. The expectation is that +the change-history of this document provides an overview summary of the +subsystem maturation over time. + +The maturity scores are: + +- [3] Mature: Work in this area is complete and no changes on the horizon. + Note that this score can regress from one kernel release to the next + based on new test results or end user reports. + +- [2] Stabilizing: Major functionality operational, common cases are + mature, but known corner cases are still a work in progress. + +- [1] Initial: Capability that has exited the Proof of Concept phase, but + may still have significant gaps to close and fixes to apply as real + world testing occurs. + +- [0] Known gap: Feature is on a medium to long term horizon to + implement. If the specification has a feature that does not even have + a '0' score in this document, there is a good chance that no one in + the linux-cxl@vger.kernel.org community has started to look at it. + +- X: Out of scope for kernel enabling, or kernel enabling not required + +Feature and Capabilities +======================== + +Enumeration / Provisioning +-------------------------- +All of the fundamental enumeration an object model of the subsystem is +in place, but there are several corner cases that are pending closure. + + +* [2] CXL Window Enumeration + + * [0] :ref:`Extended-linear memory-side cache ` + * [0] Low Memory-hole + * [0] Hetero-interleave + +* [2] Switch Enumeration + + * [0] CXL register enumeration link-up dependency + +* [2] HDM Decoder Configuration + + * [0] Decoder target and granularity constraints + +* [2] Performance enumeration + + * [3] Endpoint CDAT + * [3] Switch CDAT + * [1] CDAT to Core-mm integration + + * [1] x86 + * [0] Arm64 + * [0] All other arch. + + * [0] Shared link + +* [2] Hotplug + (see CXL Window Enumeration) + + * [0] Handle Soft Reserved conflicts + +* [0] :ref:`RCH link status ` +* [0] Fabrics / G-FAM (chapter 7) +* [0] Global Access Endpoint + + +RAS +--- +In many ways CXL can be seen as a standardization of what would normally +be handled by custom EDAC drivers. The open development here is +mainly caused by the enumeration corner cases above. + +* [3] Component events (OS) +* [2] Component events (FFM) +* [1] Endpoint protocol errors (OS) +* [1] Endpoint protocol errors (FFM) +* [0] Switch protocol errors (OS) +* [1] Switch protocol errors (FFM) +* [2] DPA->HPA Address translation + + * [1] XOR Interleave translation + (see CXL Window Enumeration) + +* [1] Memory Failure coordination +* [0] Scrub control +* [2] ACPI error injection EINJ + + * [0] EINJ v2 + * [X] Compliance DOE + +* [2] Native error injection +* [3] RCH error handling +* [1] VH error handling +* [0] PPR +* [0] Sparing +* [0] Device built in test + + +Mailbox commands +---------------- + +* [3] Firmware update +* [3] Health / Alerts +* [1] :ref:`Background commands ` +* [3] Sanitization +* [3] Security commands +* [3] RAW Command Debug Passthrough +* [0] CEL-only-validation Passthrough +* [0] Switch CCI +* [3] Timestamp +* [1] PMEM labels +* [0] PMEM GPF / Dirty Shutdown +* [0] Scan Media + +PMU +--- +* [1] Type 3 PMU +* [0] Switch USP/ DSP, Root Port + +Security +-------- + +* [X] CXL Trusted Execution Environment Security Protocol (TSP) +* [X] CXL IDE (subsumed by TSP) + +Memory-pooling +-------------- + +* [1] Hotplug of LDs (via PCI hotplug) +* [0] Dynamic Capacity Device (DCD) Support + +Multi-host sharing +------------------ + +* [0] Hardware coherent shared memory +* [0] Software managed coherency shared memory + +Multi-host memory +----------------- + +* [0] Dynamic Capacity Device Support +* [0] Sharing + +Accelerator +----------- + +* [0] Accelerator memory enumeration HDM-D (CXL 1.1/2.0 Type-2) +* [0] Accelerator memory enumeration HDM-DB (CXL 3.0 Type-2) +* [0] CXL.cache 68b (CXL 2.0) +* [0] CXL.cache 256b Cache IDs (CXL 3.0) + +User Flow Support +----------------- + +* [0] HPA->DPA Address translation (need xormaps export solution) + +Details +======= + +.. _extended-linear: + +* **Extended-linear memory-side cache**: An HMAT proposal to enumerate the presence of a + memory-side cache where the cache capacity extends the SRAT address + range capacity. `See the ECN + `_ + for more details: + +.. _rch-link-status: + +* **RCH Link Status**: RCH (Restricted CXL Host) topologies, end up + hiding some standard registers like PCIe Link Status / Capabilities in + the CXL RCRB (Root Complex Register Block). + +.. _background-commands: + +* **Background commands**: The CXL background command mechanism is + awkward as the single slot is monopolized potentially indefinitely by + various commands. A `cancel on conflict + `_ + facility is needed to make sure the kernel can ensure forward progress + of priority commands. diff --git a/MAINTAINERS b/MAINTAINERS index 3c4fdf74a3f9..ce047ab7e977 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5485,6 +5485,7 @@ M: Ira Weiny M: Dan Williams L: linux-cxl@vger.kernel.org S: Maintained +F: Documentation/driver-api/cxl F: drivers/cxl/ F: include/linux/einj-cxl.h F: include/linux/cxl-event.h From 67bab430f4e71332aced7562a67d73444f5b4b77 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Fri, 24 May 2024 14:31:52 +0200 Subject: [PATCH 033/120] tools/power turbostat: Group SMI counter with APERF and MPERF These three counters now are treated similar to other perf counters groups. This simplifies and gets rid of a lot of special cases for APERF and MPERF. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 635 ++++++++++++-------------- 1 file changed, 280 insertions(+), 355 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 29751b41ea0d..495235055fa2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -85,7 +85,6 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; -enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR }; enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; struct sysfs_path { @@ -275,7 +274,6 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct amperf_group_fd *fd_amperf_percpu; /* File descriptors for perf group with APERF and MPERF counters. */ struct timeval interval_tv = { 5, 0 }; struct timespec interval_ts = { 5, 0 }; @@ -290,6 +288,7 @@ unsigned int summary_only; unsigned int list_header_only; unsigned int dump_only; unsigned int has_aperf; +unsigned int has_aperf_access; unsigned int has_epb; unsigned int has_turbo; unsigned int is_hybrid; @@ -330,7 +329,6 @@ unsigned int first_counter_read = 1; int ignore_stdin; bool no_msr; bool no_perf; -enum amperf_source amperf_source; enum gfx_sysfs_idx { GFX_rc6, @@ -1381,6 +1379,67 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = { }, }; +/* Indexes used to map data read from perf and MSRs into global variables */ +enum msr_rci_index { + MSR_RCI_INDEX_APERF = 0, + MSR_RCI_INDEX_MPERF = 1, + MSR_RCI_INDEX_SMI = 2, + NUM_MSR_COUNTERS, +}; + +struct msr_counter_info_t { + unsigned long long data[NUM_MSR_COUNTERS]; + enum counter_source source[NUM_MSR_COUNTERS]; + unsigned long long msr[NUM_MSR_COUNTERS]; + unsigned long long msr_mask[NUM_MSR_COUNTERS]; + int fd_perf; +}; + +struct msr_counter_info_t *msr_counter_info; +unsigned int msr_counter_info_size; + +struct msr_counter_arch_info { + const char *perf_subsys; + const char *perf_name; + unsigned long long msr; + unsigned long long msr_mask; + unsigned int rci_index; /* Maps data from perf counters to global variables */ + bool needed; + bool present; +}; + +enum msr_arch_info_index { + MSR_ARCH_INFO_APERF_INDEX = 0, + MSR_ARCH_INFO_MPERF_INDEX = 1, + MSR_ARCH_INFO_SMI_INDEX = 2, +}; + +static struct msr_counter_arch_info msr_counter_arch_infos[] = { + [MSR_ARCH_INFO_APERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, + + [MSR_ARCH_INFO_MPERF_INDEX] = { + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, + + [MSR_ARCH_INFO_SMI_INDEX] = { + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, +}; + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1767,7 +1826,7 @@ int get_msr_fd(int cpu) static void bic_disable_msr_access(void) { - const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp | + const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp; bic_enabled &= ~bic_msrs; @@ -3402,30 +3461,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char return v; } -static unsigned int read_msr_type(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/type"; - const char *const format = "%u"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_aperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/aperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - -static unsigned int read_mperf_config(void) -{ - const char *const path = "/sys/bus/event_source/devices/msr/events/mperf"; - const char *const format = "event=%x"; - - return read_perf_counter_info_n(path, format); -} - static unsigned int read_perf_type(const char *subsys) { const char *const path_format = "/sys/bus/event_source/devices/%s/type"; @@ -3479,124 +3514,6 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name) return scale; } -static struct amperf_group_fd open_amperf_fd(int cpu) -{ - const unsigned int msr_type = read_msr_type(); - const unsigned int aperf_config = read_aperf_config(); - const unsigned int mperf_config = read_mperf_config(); - struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 }; - - fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP); - fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP); - - return fds; -} - -static int get_amperf_fd(int cpu) -{ - assert(fd_amperf_percpu); - - if (fd_amperf_percpu[cpu].aperf) - return fd_amperf_percpu[cpu].aperf; - - fd_amperf_percpu[cpu] = open_amperf_fd(cpu); - - return fd_amperf_percpu[cpu].aperf; -} - -/* Read APERF, MPERF and TSC using the perf API. */ -static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu) -{ - union { - struct { - unsigned long nr_entries; - unsigned long aperf; - unsigned long mperf; - }; - - unsigned long as_array[3]; - } cnt; - - const int fd_amperf = get_amperf_fd(cpu); - - /* - * Read the TSC with rdtsc, because we want the absolute value and not - * the offset from the start of the counter. - */ - t->tsc = rdtsc(); - - const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array)); - - if (n != sizeof(cnt.as_array)) - return -2; - - t->aperf = cnt.aperf * aperf_mperf_multiplier; - t->mperf = cnt.mperf * aperf_mperf_multiplier; - - return 0; -} - -/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */ -static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu) -{ - unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; - int aperf_mperf_retry_count = 0; - - /* - * The TSC, APERF and MPERF must be read together for - * APERF/MPERF and MPERF/TSC to give accurate results. - * - * Unfortunately, APERF and MPERF are read by - * individual system call, so delays may occur - * between them. If the time to read them - * varies by a large amount, we re-read them. - */ - - /* - * This initial dummy APERF read has been seen to - * reduce jitter in the subsequent reads. - */ - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - -retry: - t->tsc = rdtsc(); /* re-read close to APERF */ - - tsc_before = t->tsc; - - if (get_msr(cpu, MSR_IA32_APERF, &t->aperf)) - return -3; - - tsc_between = rdtsc(); - - if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf)) - return -4; - - tsc_after = rdtsc(); - - aperf_time = tsc_between - tsc_before; - mperf_time = tsc_after - tsc_between; - - /* - * If the system call latency to read APERF and MPERF - * differ by more than 2x, then try again. - */ - if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) { - aperf_mperf_retry_count++; - if (aperf_mperf_retry_count < 5) - goto retry; - else - warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); - } - aperf_mperf_retry_count = 0; - - t->aperf = t->aperf * aperf_mperf_multiplier; - t->mperf = t->mperf * aperf_mperf_multiplier; - - return 0; -} - size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci) { size_t ret = 0; @@ -3853,6 +3770,84 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat return 0; } +size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci) +{ + size_t ret = 0; + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) + if (mci->source[i] == COUNTER_SOURCE_PERF) + ++ret; + + return ret; +} + +int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) +{ + unsigned long long perf_data[NUM_MSR_COUNTERS + 1]; + + struct msr_counter_info_t *mci; + + if (debug) + fprintf(stderr, "%s: cpu%d\n", __func__, cpu); + + assert(msr_counter_info); + assert(cpu <= msr_counter_info_size); + + mci = &msr_counter_info[cpu]; + + ZERO_ARRAY(perf_data); + ZERO_ARRAY(mci->data); + + if (mci->fd_perf != -1) { + const size_t num_perf_counters = msr_counter_info_count_perf(mci); + const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long); + const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data)); + + if (actual_read_size != expected_read_size) + err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, + actual_read_size); + } + + for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) { + switch (mci->source[i]) { + case COUNTER_SOURCE_NONE: + break; + + case COUNTER_SOURCE_PERF: + assert(pi < ARRAY_SIZE(perf_data)); + assert(mci->fd_perf != -1); + + if (debug) + fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]); + + mci->data[i] = perf_data[pi]; + + ++pi; + break; + + case COUNTER_SOURCE_MSR: + assert(!no_msr); + + if (get_msr(cpu, mci->msr[i], &mci->data[i])) + return -2 - i; + + mci->data[i] &= mci->msr_mask[i]; + + if (debug) + fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); + + break; + } + } + + BUILD_BUG_ON(NUM_MSR_COUNTERS != 3); + t->aperf = mci->data[MSR_RCI_INDEX_APERF]; + t->mperf = mci->data[MSR_RCI_INDEX_MPERF]; + t->smi_count = mci->data[MSR_RCI_INDEX_SMI]; + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3878,24 +3873,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC) - || soft_c1_residency_display(BIC_Avg_MHz)) { - int status = -1; - - assert(!no_perf || !no_msr); - - switch (amperf_source) { - case AMPERF_SOURCE_PERF: - status = read_aperf_mperf_tsc_perf(t, cpu); - break; - case AMPERF_SOURCE_MSR: - status = read_aperf_mperf_tsc_msr(t, cpu); - break; - } - - if (status != 0) - return status; - } + get_smi_aperf_mperf(cpu, t); if (DO_BIC(BIC_IPC)) if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) @@ -3903,11 +3881,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; - if (DO_BIC(BIC_SMI)) { - if (get_msr(cpu, MSR_SMI_COUNT, &msr)) - return -5; - t->smi_count = msr & 0xFFFFFFFF; - } get_cstate_counters(cpu, t, c, p); @@ -4489,25 +4462,6 @@ void free_fd_percpu(void) fd_percpu = NULL; } -void free_fd_amperf_percpu(void) -{ - int i; - - if (!fd_amperf_percpu) - return; - - for (i = 0; i < topo.max_cpu_num + 1; ++i) { - if (fd_amperf_percpu[i].mperf != 0) - close(fd_amperf_percpu[i].mperf); - - if (fd_amperf_percpu[i].aperf != 0) - close(fd_amperf_percpu[i].aperf); - } - - free(fd_amperf_percpu); - fd_amperf_percpu = NULL; -} - void free_fd_instr_count_percpu(void) { if (!fd_instr_count_percpu) @@ -4542,6 +4496,21 @@ void free_fd_cstate(void) ccstate_counter_info_size = 0; } +void free_fd_msr(void) +{ + if (!msr_counter_info) + return; + + for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) { + if (msr_counter_info[cpu].fd_perf != -1) + close(msr_counter_info[cpu].fd_perf); + } + + free(msr_counter_info); + msr_counter_info = NULL; + msr_counter_info_size = 0; +} + void free_fd_rapl_percpu(void) { if (!rapl_counter_info_perdomain) @@ -4601,7 +4570,7 @@ void free_all_buffers(void) free_fd_percpu(); free_fd_instr_count_percpu(); - free_fd_amperf_percpu(); + free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); @@ -4938,6 +4907,7 @@ static void update_effective_set(bool startup) } void linux_perf_init(void); +void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); @@ -4946,6 +4916,7 @@ void re_initialize(void) free_all_buffers(); setup_all_buffers(false); linux_perf_init(); + msr_perf_init(); rapl_perf_init(); cstate_perf_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, @@ -6799,15 +6770,6 @@ static int has_instr_count_access(void) return has_access; } -bool is_aperf_access_required(void) -{ - return BIC_IS_ENABLED(BIC_Avg_MHz) - || BIC_IS_ENABLED(BIC_Busy) - || BIC_IS_ENABLED(BIC_Bzy_MHz) - || BIC_IS_ENABLED(BIC_IPC) - || BIC_IS_ENABLED(BIC_CPU_c1); -} - int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai, double *scale_, enum rapl_unit *unit_) { @@ -6866,14 +6828,6 @@ void linux_perf_init(void) if (fd_instr_count_percpu == NULL) err(-1, "calloc fd_instr_count_percpu"); } - - const bool aperf_required = is_aperf_access_required(); - - if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) { - fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu)); - if (fd_amperf_percpu == NULL) - err(-1, "calloc fd_amperf_percpu"); - } } void rapl_perf_init(void) @@ -6966,75 +6920,11 @@ void rapl_perf_init(void) free(domain_visited); } -static int has_amperf_access_via_msr(void) -{ - if (no_msr) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_APERF)) - return 0; - - if (probe_msr(base_cpu, MSR_IA32_MPERF)) - return 0; - - return 1; -} - -static int has_amperf_access_via_perf(void) -{ - struct amperf_group_fd fds; - - /* - * Cache the last result, so we don't warn the user multiple times - * - * Negative means cached, no access - * Zero means not cached - * Positive means cached, has access - */ - static int has_access_cached; - - if (no_perf) - return 0; - - if (has_access_cached != 0) - return has_access_cached > 0; - - fds = open_amperf_fd(base_cpu); - has_access_cached = (fds.aperf != -1) && (fds.mperf != -1); - - if (fds.aperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "APERF perf counter", "--no-perf"); - else - close(fds.aperf); - - if (fds.mperf == -1) - warnx("Failed to access %s. Some of the counters may not be available\n" - "\tRun as root to enable them or use %s to disable the access explicitly", - "MPERF perf counter", "--no-perf"); - else - close(fds.mperf); - - if (has_access_cached == 0) - has_access_cached = -1; - - return has_access_cached > 0; -} - -/* Check if we can access APERF and MPERF */ +/* Assumes msr_counter_info is populated */ static int has_amperf_access(void) { - if (!is_aperf_access_required()) - return 0; - - if (!no_msr && has_amperf_access_via_msr()) - return 1; - - if (!no_perf && has_amperf_access_via_perf()) - return 1; - - return 0; + return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present && + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present; } int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name) @@ -7083,6 +6973,114 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st return ret; } +int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + if (no_perf) + return -1; + + const unsigned int type = read_perf_type(cai->perf_subsys); + const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + + const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); + + if (fd_counter == -1) + return -1; + + /* If it's the first counter opened, make it a group descriptor */ + if (cci->fd_perf == -1) + cci->fd_perf = fd_counter; + + return fd_counter; +} + +int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai) +{ + int ret = add_msr_perf_counter_(cpu, cci, cai); + + if (debug) + fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu); + + return ret; +} + +void msr_perf_init_(void) +{ + const int mci_num = topo.max_cpu_num + 1; + + msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info)); + if (!msr_counter_info) + err(1, "calloc msr_counter_info"); + msr_counter_info_size = mci_num; + + for (int cpu = 0; cpu < mci_num; ++cpu) + msr_counter_info[cpu].fd_perf = -1; + + for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) { + + struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx]; + + cai->present = false; + + for (int cpu = 0; cpu < mci_num; ++cpu) { + + struct msr_counter_info_t *const cci = &msr_counter_info[cpu]; + + if (cpu_is_not_allowed(cpu)) + continue; + + if (cai->needed) { + /* Use perf API for this counter */ + if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) { + cci->source[cai->rci_index] = COUNTER_SOURCE_PERF; + cai->present = true; + + /* User MSR for this counter */ + } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) { + cci->source[cai->rci_index] = COUNTER_SOURCE_MSR; + cci->msr[cai->rci_index] = cai->msr; + cci->msr_mask[cai->rci_index] = cai->msr_mask; + cai->present = true; + } + } + } + } +} + +/* Initialize data for reading perf counters from the MSR group. */ +void msr_perf_init(void) +{ + bool need_amperf = false, need_smi = false; + const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1); + + need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz) + || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1; + + if (BIC_IS_ENABLED(BIC_SMI)) + need_smi = true; + + /* Enable needed counters */ + msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf; + msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi; + + msr_perf_init_(); + + const bool has_amperf = has_amperf_access(); + const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present; + + has_aperf_access = has_amperf; + + if (has_amperf) { + BIC_PRESENT(BIC_Avg_MHz); + BIC_PRESENT(BIC_Busy); + BIC_PRESENT(BIC_Bzy_MHz); + BIC_PRESENT(BIC_SMI); + } + + if (has_smi) + BIC_PRESENT(BIC_SMI); +} + void cstate_perf_init_(bool soft_c1) { bool has_counter; @@ -7340,12 +7338,6 @@ void process_cpuid() __cpuid(0x6, eax, ebx, ecx, edx); has_aperf = ecx & (1 << 0); - if (has_aperf && has_amperf_access()) { - BIC_PRESENT(BIC_Avg_MHz); - BIC_PRESENT(BIC_Busy); - BIC_PRESENT(BIC_Bzy_MHz); - BIC_PRESENT(BIC_IPC); - } do_dts = eax & (1 << 0); if (do_dts) BIC_PRESENT(BIC_CoreTmp); @@ -7462,6 +7454,11 @@ static void counter_info_init(void) if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY) cai->msr = MSR_ATOM_PKG_C6_RESIDENCY; } + + for (int i = 0; i < NUM_MSR_COUNTERS; ++i) { + msr_counter_arch_infos[i].present = false; + msr_counter_arch_infos[i].needed = false; + } } void probe_pm_features(void) @@ -7837,21 +7834,6 @@ void set_base_cpu(void) err(-ENODEV, "No valid cpus found"); } -static void set_amperf_source(void) -{ - amperf_source = AMPERF_SOURCE_PERF; - - const bool aperf_required = is_aperf_access_required(); - - if (no_perf || !aperf_required || !has_amperf_access_via_perf()) - amperf_source = AMPERF_SOURCE_MSR; - - if (quiet || !debug) - return; - - fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf"); -} - bool has_added_counters(void) { /* @@ -7862,54 +7844,8 @@ bool has_added_counters(void) return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters; } -bool is_msr_access_required(void) -{ - if (no_msr) - return false; - - if (has_added_counters()) - return true; - - return BIC_IS_ENABLED(BIC_SMI) - || BIC_IS_ENABLED(BIC_CPU_c1) - || BIC_IS_ENABLED(BIC_CPU_c3) - || BIC_IS_ENABLED(BIC_CPU_c6) - || BIC_IS_ENABLED(BIC_CPU_c7) - || BIC_IS_ENABLED(BIC_Mod_c6) - || BIC_IS_ENABLED(BIC_CoreTmp) - || BIC_IS_ENABLED(BIC_Totl_c0) - || BIC_IS_ENABLED(BIC_Any_c0) - || BIC_IS_ENABLED(BIC_GFX_c0) - || BIC_IS_ENABLED(BIC_CPUGFX) - || BIC_IS_ENABLED(BIC_Pkgpc3) - || BIC_IS_ENABLED(BIC_Pkgpc6) - || BIC_IS_ENABLED(BIC_Pkgpc2) - || BIC_IS_ENABLED(BIC_Pkgpc7) - || BIC_IS_ENABLED(BIC_Pkgpc8) - || BIC_IS_ENABLED(BIC_Pkgpc9) - || BIC_IS_ENABLED(BIC_Pkgpc10) - /* TODO: Multiplex access with perf */ - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_PkgWatt) - || BIC_IS_ENABLED(BIC_CorWatt) - || BIC_IS_ENABLED(BIC_GFXWatt) - || BIC_IS_ENABLED(BIC_RAMWatt) - || BIC_IS_ENABLED(BIC_Pkg_J) - || BIC_IS_ENABLED(BIC_Cor_J) - || BIC_IS_ENABLED(BIC_GFX_J) - || BIC_IS_ENABLED(BIC_RAM_J) - || BIC_IS_ENABLED(BIC_PKG__) - || BIC_IS_ENABLED(BIC_RAM__) - || BIC_IS_ENABLED(BIC_PkgTmp) - || (is_aperf_access_required() && !has_amperf_access_via_perf()); -} - void check_msr_access(void) { - if (!is_msr_access_required()) - no_msr = 1; - check_dev_msr(); check_msr_permission(); @@ -7919,19 +7855,8 @@ void check_msr_access(void) void check_perf_access(void) { - const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC); - - if (no_perf || !intrcount_required || !has_instr_count_access()) + if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access()) bic_enabled &= ~BIC_IPC; - - const bool aperf_required = is_aperf_access_required(); - - if (!aperf_required || !has_amperf_access()) { - bic_enabled &= ~BIC_Avg_MHz; - bic_enabled &= ~BIC_Busy; - bic_enabled &= ~BIC_Bzy_MHz; - bic_enabled &= ~BIC_IPC; - } } void turbostat_init() @@ -7943,7 +7868,7 @@ void turbostat_init() process_cpuid(); counter_info_init(); probe_pm_features(); - set_amperf_source(); + msr_perf_init(); linux_perf_init(); rapl_perf_init(); cstate_perf_init(); @@ -7951,8 +7876,8 @@ void turbostat_init() for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); - if (DO_BIC(BIC_IPC)) - (void)get_instr_count_fd(base_cpu); + if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1) + BIC_PRESENT(BIC_IPC); /* * If TSC tweak is needed, but couldn't get it, From 361b8fc73cf63dc0c3be3778720631f1a33ba9db Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 28 May 2024 15:46:10 +0200 Subject: [PATCH 034/120] tools/power turbostat: Extend --add option with perf counters User can now read perf counters using "--add perf//". Other details work similarly to how --add works with MSRs. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/Makefile | 1 + tools/power/x86/turbostat/turbostat.8 | 5 +- tools/power/x86/turbostat/turbostat.c | 562 +++++++++++++++++++++++++- 3 files changed, 557 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile index b1e6817f1e54..3946d5254a1f 100644 --- a/tools/power/x86/turbostat/Makefile +++ b/tools/power/x86/turbostat/Makefile @@ -46,6 +46,7 @@ snapshot: turbostat @echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h @echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h + @echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h @echo PWD=. > $(SNAPSHOT)/Makefile @echo "CFLAGS += -DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8d37acd39201..5537fc6b5bc3 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--add attributes\fP add column with counter having specified 'attributes'. The 'location' attribute is required, all others are optional. .nf - location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP} + location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf//\fP} msrDDD is a decimal offset, eg. msr16 msr0xXXX is a hex offset, eg. msr0x10 /sys/path... is an absolute path to a sysfs attribute + is a perf device from /sys/bus/event_source/devices/ eg. cstate_core + is a perf event for given device from /sys/bus/event_source/devices//events/ eg. c1-residency + perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP} sample and print the counter for every cpu, core, or package. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 495235055fa2..be345a4bbe96 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -79,14 +79,40 @@ */ #define NAME_BYTES 20 #define PATH_BYTES 128 +#define PERF_NAME_BYTES 128 #define MAX_NOFILE 0x8000 +#define COUNTER_KIND_PERF_PREFIX "perf/" +#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX) +#define PERF_DEV_NAME_BYTES 32 +#define PERF_EVT_NAME_BYTES 32 + enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M }; enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE }; enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR }; +struct perf_counter_info { + struct perf_counter_info *next; + + /* How to open the counter / What counter it is. */ + char device[PERF_DEV_NAME_BYTES]; + char event[PERF_EVT_NAME_BYTES]; + + /* How to show/format the counter. */ + char name[PERF_NAME_BYTES]; + unsigned int width; + enum counter_scope scope; + enum counter_type type; + enum counter_format format; + double scale; + + /* For reading the counter. */ + int *fd_perf_per_domain; + size_t num_domains; +}; + struct sysfs_path { char path[PATH_BYTES]; int id; @@ -1457,6 +1483,7 @@ struct thread_data { unsigned int flags; bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1470,6 +1497,7 @@ struct core_data { unsigned int core_id; unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1503,6 +1531,7 @@ struct pkg_data { unsigned int pkg_temp_c; unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1637,12 +1666,21 @@ int idx_valid(int idx) } struct sys_counters { + /* MSR added counters */ unsigned int added_thread_counters; unsigned int added_core_counters; unsigned int added_package_counters; struct msr_counter *tp; struct msr_counter *cp; struct msr_counter *pp; + + /* perf added counters */ + unsigned int added_thread_perf_counters; + unsigned int added_core_perf_counters; + unsigned int added_package_perf_counters; + struct perf_counter_info *perf_tp; + struct perf_counter_info *perf_cp; + struct perf_counter_info *perf_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1902,6 +1940,23 @@ int probe_msr(int cpu, off_t offset) return 0; } +/* Convert CPU ID to domain ID for given added perf counter. */ +unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu) +{ + switch (pc->scope) { + case SCOPE_CPU: + return cpu; + + case SCOPE_CORE: + return cpus[cpu].physical_core_id; + + case SCOPE_PACKAGE: + return cpus[cpu].physical_package_id; + } + + __builtin_unreachable(); +} + #define MAX_DEFERRED 16 char *deferred_add_names[MAX_DEFERRED]; char *deferred_skip_names[MAX_DEFERRED]; @@ -1925,6 +1980,7 @@ void help(void) "to print statistics, until interrupted.\n" " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" @@ -2034,6 +2090,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) void print_header(char *delim) { struct msr_counter *mp; + struct perf_counter_info *pp; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2091,6 +2148,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_tp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2131,6 +2203,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_cp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2226,6 +2313,21 @@ void print_header(char *delim) } } + for (pp = sys.perf_pp; pp; pp = pp->next) { + + if (pp->format == FORMAT_RAW) { + if (pp->width == 64) + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name); + } else { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name); + else + outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name); + } + } + outp += sprintf(outp, "\n"); } @@ -2346,6 +2448,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data char *fmt8; int i; struct msr_counter *mp; + struct perf_counter_info *pp; char *delim = "\t"; int printed = 0; @@ -2483,6 +2586,31 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + /* Added perf counters */ + for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + if (pp->type == COUNTER_USEC) + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->perf_counter[i] / interval_float / 10000); + else + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc); + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2526,6 +2654,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc); + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2680,6 +2826,26 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000); } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) { + if (pp->width == 32) + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->perf_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_DELTA) { + if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns) + outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]); + else + outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); + } else if (pp->format == FORMAT_PERCENT) { + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); + } else if (pp->type == COUNTER_K2M) + outp += + sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + done: if (*(outp - 1) != '\n') outp += sprintf(outp, "\n"); @@ -2733,6 +2899,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2793,6 +2960,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->counter[i] = new->counter[i] - old->counter[i]; } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else if (pp->format == FORMAT_AVERAGE) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + return 0; } @@ -2800,6 +2976,7 @@ void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -2816,6 +2993,13 @@ void delta_core(struct core_data *new, struct core_data *old) else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -2833,6 +3017,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d { int i; struct msr_counter *mp; + struct perf_counter_info *pp; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -2911,6 +3096,14 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d else old->counter[i] = new->counter[i] - old->counter[i]; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + old->perf_counter[i] = new->perf_counter[i]; + else + old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; + } + return 0; } @@ -3013,6 +3206,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; + + memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); + memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); + memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -3033,6 +3230,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; + struct perf_counter_info *pp; /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -3063,6 +3261,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.counter[i] += t->counter[i]; } + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.threads.perf_counter[i] += t->perf_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3083,6 +3287,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.counter[i] += c->counter[i]; } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + average.cores.perf_counter[i] += c->perf_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3134,6 +3344,14 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) else average.packages.counter[i] += p->counter[i]; } + + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0)) + average.packages.perf_counter[i] = p->perf_counter[i]; + else + average.packages.perf_counter[i] += p->perf_counter[i]; + } + return 0; } @@ -3145,6 +3363,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data { int i; struct msr_counter *mp; + struct perf_counter_info *pp; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3216,6 +3435,35 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.counter[i] /= topo.allowed_packages; } + + for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.threads.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + continue; + } + average.threads.perf_counter[i] /= topo.allowed_cpus; + } + for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.cores.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.cores.perf_counter[i] /= topo.allowed_cores; + } + for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) { + if (pp->format == FORMAT_RAW) + continue; + if (pp->type == COUNTER_ITEMS) { + if (average.packages.perf_counter[i] > 9999999) + sums_need_wide_columns = 1; + } + average.packages.perf_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -3848,6 +4096,31 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) return 0; } +int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size) +{ + unsigned int domain; + unsigned long long value; + int fd_counter; + + for (size_t i = 0; pp; ++i, pp = pp->next) { + domain = cpu_to_domain(pp, cpu); + assert(domain < pp->num_domains); + + fd_counter = pp->fd_perf_per_domain[domain]; + + if (fd_counter == -1) + continue; + + if (read(fd_counter, &value, sizeof(value)) != sizeof(value)) + return 1; + + assert(i < out_size); + out[i] = value * pp->scale; + } + + return 0; +} + /* * get_counters(...) * migrate to cpu @@ -3889,6 +4162,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) + return -10; + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -3927,6 +4203,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -10; } + if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) + return -10; + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -3999,6 +4278,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_mp(cpu, mp, &p->counter[i], path)) return -10; } + + if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) + return -10; + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -4528,6 +4811,36 @@ void free_fd_rapl_percpu(void) rapl_counter_info_perdomain_size = 0; } +void free_fd_added_perf_counters_(struct perf_counter_info *pp) +{ + if (!pp) + return; + + if (!pp->fd_perf_per_domain) + return; + + while (pp) { + for (size_t domain = 0; domain < pp->num_domains; ++domain) { + if (pp->fd_perf_per_domain[domain] != -1) { + close(pp->fd_perf_per_domain[domain]); + pp->fd_perf_per_domain[domain] = -1; + } + } + + free(pp->fd_perf_per_domain); + pp->fd_perf_per_domain = NULL; + + pp = pp->next; + } +} + +void free_fd_added_perf_counters(void) +{ + free_fd_added_perf_counters_(sys.perf_tp); + free_fd_added_perf_counters_(sys.perf_cp); + free_fd_added_perf_counters_(sys.perf_pp); +} + void free_all_buffers(void) { int i; @@ -4573,6 +4886,7 @@ void free_all_buffers(void) free_fd_msr(); free_fd_rapl_percpu(); free_fd_cstate(); + free_fd_added_perf_counters(); free(irq_column_2_cpu); free(irqs_per_cpu); @@ -4910,6 +5224,7 @@ void linux_perf_init(void); void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); +void added_perf_counters_init(void); void re_initialize(void) { @@ -4919,6 +5234,7 @@ void re_initialize(void) msr_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -7859,6 +8175,117 @@ void check_perf_access(void) bic_enabled &= ~BIC_IPC; } +int added_perf_counters_init_(struct perf_counter_info *pinfo) +{ + size_t num_domains = 0; + unsigned int next_domain; + bool *domain_visited; + unsigned int perf_type, perf_config; + double perf_scale; + int fd_perf; + + if (!pinfo) + return 0; + + const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1)); + + domain_visited = calloc(max_num_domains, sizeof(*domain_visited)); + + while (pinfo) { + switch (pinfo->scope) { + case SCOPE_CPU: + num_domains = topo.max_cpu_num + 1; + break; + + case SCOPE_CORE: + num_domains = topo.max_core_id + 1; + break; + + case SCOPE_PACKAGE: + num_domains = topo.max_package_id + 1; + break; + } + + /* Allocate buffer for file descriptor for each domain. */ + pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain)); + if (!pinfo->fd_perf_per_domain) + errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain"); + + for (size_t i = 0; i < num_domains; ++i) + pinfo->fd_perf_per_domain[i] = -1; + + pinfo->num_domains = num_domains; + pinfo->scale = 1.0; + + memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited)); + + for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) { + + next_domain = cpu_to_domain(pinfo, cpu); + + assert(next_domain < num_domains); + + if (cpu_is_not_allowed(cpu)) + continue; + + if (domain_visited[next_domain]) + continue; + + perf_type = read_perf_type(pinfo->device); + if (perf_type == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "type"); + continue; + } + + perf_config = read_rapl_config(pinfo->device, pinfo->event); + if (perf_config == (unsigned int)-1) { + warnx("%s: perf/%s/%s: failed to read %s", + __func__, pinfo->device, pinfo->event, "config"); + continue; + } + + /* Scale is not required, some counters just don't have it. */ + perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event); + if (perf_scale == 0.0) + perf_scale = 1.0; + + fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); + if (fd_perf == -1) { + warnx("%s: perf/%s/%s: failed to open counter on cpu%d", + __func__, pinfo->device, pinfo->event, cpu); + continue; + } + + domain_visited[next_domain] = 1; + pinfo->fd_perf_per_domain[next_domain] = fd_perf; + pinfo->scale = perf_scale; + + if (debug) + printf("Add perf/%s/%s cpu%d: %d\n", + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + } + + pinfo = pinfo->next; + } + + free(domain_visited); + + return 0; +} + +void added_perf_counters_init(void) +{ + if (added_perf_counters_init_(sys.perf_tp)) + errx(1, "%s: %s", __func__, "thread"); + + if (added_perf_counters_init_(sys.perf_cp)) + errx(1, "%s: %s", __func__, "core"); + + if (added_perf_counters_init_(sys.perf_pp)) + errx(1, "%s: %s", __func__, "package"); +} + void turbostat_init() { setup_all_buffers(true); @@ -7872,6 +8299,7 @@ void turbostat_init() linux_perf_init(); rapl_perf_init(); cstate_perf_init(); + added_perf_counters_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -8061,6 +8489,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = calloc(1, sizeof(struct msr_counter)); if (msrp == NULL) err(-1, "calloc msr_counter"); + msrp->msr_num = msr_num; strncpy(msrp->name, name, NAME_BYTES - 1); msrp->width = width; @@ -8101,11 +8530,106 @@ int add_counter(unsigned int msr_num, char *path, char *name, return 0; } +/* + * Initialize the fields used for identifying and opening the counter. + * + * Defer the initialization of any runtime buffers for actually reading + * the counters for when we initialize all perf counters, so we can later + * easily call re_initialize(). + */ +struct perf_counter_info *make_perf_counter_info(const char *perf_device, + const char *perf_event, + const char *name, + unsigned int width, + enum counter_scope scope, + enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + pinfo = calloc(1, sizeof(*pinfo)); + if (!pinfo) + errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event); + + strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1); + strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1); + + strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1); + pinfo->width = width; + pinfo->scope = scope; + pinfo->type = type; + pinfo->format = format; + + return pinfo; +} + +int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width, + enum counter_scope scope, enum counter_type type, enum counter_format format) +{ + struct perf_counter_info *pinfo; + + switch (scope) { + case SCOPE_CPU: + if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) { + warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_CORE: + if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) { + warnx("ignoring core counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + + case SCOPE_PACKAGE: + if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) { + warnx("ignoring package counter perf/%s/%s", perf_device, perf_event); + return -1; + } + break; + } + + pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format); + + if (!pinfo) + return -1; + + switch (scope) { + case SCOPE_CPU: + pinfo->next = sys.perf_tp; + sys.perf_tp = pinfo; + ++sys.added_thread_perf_counters; + break; + + case SCOPE_CORE: + pinfo->next = sys.perf_cp; + sys.perf_cp = pinfo; + ++sys.added_core_perf_counters; + break; + + case SCOPE_PACKAGE: + pinfo->next = sys.perf_pp; + sys.perf_pp = pinfo; + ++sys.added_package_perf_counters; + break; + } + + // FIXME: we might not have debug here yet + if (debug) + printf("%s: %s/%s, name: %s, scope%d\n", + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + + return 0; +} + void parse_add_command(char *add_command) { int msr_num = 0; char *path = NULL; - char name_buffer[NAME_BYTES] = ""; + char perf_device[PERF_DEV_NAME_BYTES] = ""; + char perf_event[PERF_EVT_NAME_BYTES] = ""; + char name_buffer[PERF_NAME_BYTES] = ""; int width = 64; int fail = 0; enum counter_scope scope = SCOPE_CPU; @@ -8120,6 +8644,11 @@ void parse_add_command(char *add_command) if (sscanf(add_command, "msr%d", &msr_num) == 1) goto next; + BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31); + BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31); + if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2) + goto next; + if (*add_command == '/') { path = add_command; goto next; @@ -8167,7 +8696,8 @@ void parse_add_command(char *add_command) goto next; } - if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { /* 18 < NAME_BYTES */ + BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18); + if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) { char *eos; eos = strchr(name_buffer, ','); @@ -8184,21 +8714,33 @@ next: } } - if ((msr_num == 0) && (path == NULL)) { - fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n"); + if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) { + fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n"); fail++; } + /* Test for non-empty perf_device and perf_event */ + const bool is_perf_counter = perf_device[0] && perf_event[0]; + /* generate default column header */ if (*name_buffer == '\0') { - if (width == 32) - sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); - else - sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + if (is_perf_counter) { + snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event); + } else { + if (width == 32) + sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + else + sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : ""); + } } - if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) - fail++; + if (is_perf_counter) { + if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format)) + fail++; + } else { + if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0)) + fail++; + } if (fail) { help(); From 25826c20da55a114c2cba8c4f237dfe7a7b4f8f6 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:22:30 +0200 Subject: [PATCH 035/120] tools/power turbostat: Fix formatting in turbostat.8 We had an extra "+" at the beginning of some lines that look like a poorly formated patch. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 5537fc6b5bc3..8a21e9b56071 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -70,10 +70,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou .PP \fB--quiet\fP Do not decode and print the system configuration header information. .PP -+\fB--no-msr\fP Disable all the uses of the MSR driver. -+.PP -+\fB--no-perf\fP Disable all the uses of the perf API. -+.PP +\fB--no-msr\fP Disable all the uses of the MSR driver. +.PP +\fB--no-perf\fP Disable all the uses of the perf API. +.PP \fB--interval seconds\fP overrides the default 5.0 second measurement interval. .PP \fB--num_iterations num\fP number of the measurement iterations. From 9f50066b0dd47cff045ba6da37e6def52783ba55 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:46:20 +0200 Subject: [PATCH 036/120] tools/power turbostat: Add perf added counter example to turbostat.8 We had few lines about the feature, but without any complete examples. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 8a21e9b56071..71ae2d5159ad 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -336,6 +336,24 @@ CPU PRF_CTRL .fi +.SH ADD PERF COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. +We add a counter showing time spent in C1 core cstate, +labeling it with the column header, "pCPU%c1", and display it only once, +after the conclusion of 0.1 second sleep. +We also show CPU%c1 built-in counter that should show similar values. +.nf +sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1 +0.102448 sec +CPU pCPU%c1 CPU%c1 +- 34.89 34.89 +0 45.99 45.99 +1 45.94 45.94 +2 23.83 23.83 +3 23.84 23.84 + +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval From 478a01016c08d33635359254fc1ab61a8c83dd1a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 9 Jul 2024 14:53:14 +0200 Subject: [PATCH 037/120] tools/power turbostat: Fix typo in turbostat.8 "After" was missing an "r", nothing to see here. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 71ae2d5159ad..44cbc7cb382d 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -323,7 +323,7 @@ available on all processors. Here we limit turbostat to showing just the CPU number for cpu0 - cpu3. We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL), labeling it with the column header, "PRF_CTRL", and display it only once, -afte the conclusion of a 0.1 second sleep. +after the conclusion of a 0.1 second sleep. .nf sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1 0.101604 sec From e516211f615fb54ce3429870eefc17469ae289b8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:56 +0200 Subject: [PATCH 038/120] rust: macros: indent list item in `paste!`'s docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A new style lint, `doc_lazy_continuation` [1], has been introduced in the upcoming Rust 1.80 (currently in beta), which detects missing indentation in code documentation. We have one such case: error: doc list item missing indentation --> rust/macros/lib.rs:315:5 | 315 | /// default the span of the `[< >]` group is used. | ^ | = help: if this is supposed to be its own paragraph, add a blank line = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_lazy_continuation = note: `-D clippy::doc-lazy-continuation` implied by `-D clippy::style` = help: to override `-D clippy::style` add `#[allow(clippy::doc_lazy_continuation)]` help: indent this line | 315 | /// default the span of the `[< >]` group is used. | ++ While the rendering of the docs by `rustdoc` is not affected, we apply this kind of indentation elsewhere since it looks better. Thus clean it up. Link: https://rust-lang.github.io/rust-clippy/master/index.html#/doc_lazy_continuation [1] Reviewed-by: Björn Roy Baron Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/macros/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs index 6dcc72aff111..159e75292970 100644 --- a/rust/macros/lib.rs +++ b/rust/macros/lib.rs @@ -345,7 +345,7 @@ pub fn pinned_drop(args: TokenStream, input: TokenStream) -> TokenStream { /// /// Currently supported modifiers are: /// * `span`: change the span of concatenated identifier to the span of the specified token. By -/// default the span of the `[< >]` group is used. +/// default the span of the `[< >]` group is used. /// * `lower`: change the identifier to lower case. /// * `upper`: change the identifier to upper case. /// From dee1396a486cf2b6e7840322f6d104680649f2ff Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:57 +0200 Subject: [PATCH 039/120] rust: init: simplify from `map_err` to `inspect_err` A new complexity lint, `manual_inspect` [1], has been introduced in the upcoming Rust 1.81 (currently in nightly), which checks for uses of `map*` which return the original item: error: --> rust/kernel/init.rs:846:23 | 846 | (self.1)(val).map_err(|e| { | ^^^^^^^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_inspect = note: `-D clippy::manual-inspect` implied by `-D warnings` = help: to override `-D warnings` add `#[allow(clippy::manual_inspect)]` help: try | 846 ~ (self.1)(val).inspect_err(|e| { 847 | // SAFETY: `slot` was initialized above. 848 ~ unsafe { core::ptr::drop_in_place(slot) }; | Thus clean them up. Link: https://rust-lang.github.io/rust-clippy/master/index.html#/manual_inspect [1] Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-3-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/kernel/init.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs index 68605b633e73..495c09ebe3a3 100644 --- a/rust/kernel/init.rs +++ b/rust/kernel/init.rs @@ -843,11 +843,8 @@ where let val = unsafe { &mut *slot }; // SAFETY: `slot` is considered pinned. let val = unsafe { Pin::new_unchecked(val) }; - (self.1)(val).map_err(|e| { - // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + // SAFETY: `slot` was initialized above. + (self.1)(val).inspect_err(|_| unsafe { core::ptr::drop_in_place(slot) }) } } @@ -941,11 +938,9 @@ where // SAFETY: All requirements fulfilled since this function is `__init`. unsafe { self.0.__pinned_init(slot)? }; // SAFETY: The above call initialized `slot` and we still have unique access. - (self.1)(unsafe { &mut *slot }).map_err(|e| { + (self.1)(unsafe { &mut *slot }).inspect_err(|_| // SAFETY: `slot` was initialized above. - unsafe { core::ptr::drop_in_place(slot) }; - e - }) + unsafe { core::ptr::drop_in_place(slot) }) } } From f85bea18f71b2817ea45d63c6d1b91f9bc4a811f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:58 +0200 Subject: [PATCH 040/120] rust: allow `dead_code` for never constructed bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with the upcoming Rust 1.80.0 (since upstream commit 35130d7233e9 ("Detect pub structs never constructed and unused associated constants in traits")), the `dead_code` pass detects more cases, which triggers in the `bindings` crate: warning: struct `boot_params` is never constructed --> rust/bindings/bindings_generated.rs:10684:12 | 10684 | pub struct boot_params { | ^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default As well as in the `uapi` one: warning: struct `boot_params` is never constructed --> rust/uapi/uapi_generated.rs:10392:12 | 10392 | pub struct boot_params { | ^^^^^^^^^^^ | = note: `#[warn(dead_code)]` on by default These are all expected, since we do not use all the structs in the bindings that `bindgen` generates from the C headers. Therefore, allow them. Reviewed-by: Björn Roy Baron Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-4-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/bindings/lib.rs | 1 + rust/uapi/lib.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs index 40ddaee50d8b..93a1a3fc97bc 100644 --- a/rust/bindings/lib.rs +++ b/rust/bindings/lib.rs @@ -24,6 +24,7 @@ unsafe_op_in_unsafe_fn )] +#[allow(dead_code)] mod bindings_raw { // Use glob import here to expose all helpers. // Symbols defined within the module will take precedence to the glob import. diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs index 0caad902ba40..80a00260e3e7 100644 --- a/rust/uapi/lib.rs +++ b/rust/uapi/lib.rs @@ -14,6 +14,7 @@ #![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))] #![allow( clippy::all, + dead_code, missing_docs, non_camel_case_types, non_upper_case_globals, From f8f88aa25a03ce1e0fc8a9842840988b870f0c37 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:05:59 +0200 Subject: [PATCH 041/120] rust: relax most deny-level lints to warnings Since we are starting to support several Rust toolchains, lints (including Clippy ones) now may behave differently and lint groups may include new lints. Therefore, to maximize the chances a given version works, relax some deny-level lints to warnings. It may also make our lives a bit easier while developing new code or refactoring. To be clear, the requirements for in-tree code are still the same, since Rust code still needs to be warning-free (patches should be clean under `WERROR=y`) and the set of lints is not changed. `unsafe_op_in_unsafe_fn` is left unmodified, i.e. as an error, since it is becoming the default in the language (warn-by-default in Rust 2024 [1] and ideally an error later on) and thus it should also be very well tested. In addition, it is simple enough that it should not have false positives (unlike e.g. `rust_2018_idioms`'s `explicit_outlives_requirements`). `non_ascii_idents` is left unmodified as well, i.e. as an error, since it is unlikely one gains any productivity during development if it were a warning (in fact, it may be worse, since it is likely one made a typo). In addition, it should not have false positives. Finally, put the two `-D` ones at the top and take the chance to do one per line. Link: https://github.com/rust-lang/rust/pull/112038 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-5-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 24 +++++++++++++----------- rust/Makefile | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index fea263aaa492..7ea526814fdb 100644 --- a/Makefile +++ b/Makefile @@ -461,17 +461,19 @@ KBUILD_USERLDFLAGS := $(USERLDFLAGS) # host programs. export rust_common_flags := --edition=2021 \ -Zbinary_dep_depinfo=y \ - -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \ - -Dunreachable_pub -Dnon_ascii_idents \ + -Dunsafe_op_in_unsafe_fn \ + -Dnon_ascii_idents \ + -Wrust_2018_idioms \ + -Wunreachable_pub \ -Wmissing_docs \ - -Drustdoc::missing_crate_level_docs \ - -Dclippy::correctness -Dclippy::style \ - -Dclippy::suspicious -Dclippy::complexity \ - -Dclippy::perf \ - -Dclippy::let_unit_value -Dclippy::mut_mut \ - -Dclippy::needless_bitwise_bool \ - -Dclippy::needless_continue \ - -Dclippy::no_mangle_with_rust_abi \ + -Wrustdoc::missing_crate_level_docs \ + -Wclippy::correctness -Wclippy::style \ + -Wclippy::suspicious -Wclippy::complexity \ + -Wclippy::perf \ + -Wclippy::let_unit_value -Wclippy::mut_mut \ + -Wclippy::needless_bitwise_bool \ + -Wclippy::needless_continue \ + -Wclippy::no_mangle_with_rust_abi \ -Wclippy::dbg_macro KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS) @@ -572,7 +574,7 @@ KBUILD_RUSTFLAGS := $(rust_common_flags) \ -Csymbol-mangling-version=v0 \ -Crelocation-model=static \ -Zfunction-sections=n \ - -Dclippy::float_arithmetic + -Wclippy::float_arithmetic KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := diff --git a/rust/Makefile b/rust/Makefile index 385378311322..bf05e65365da 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -367,7 +367,7 @@ ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),) endif $(obj)/core.o: private skip_clippy = 1 -$(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private skip_flags = -Wunreachable_pub $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym)) $(obj)/core.o: private rustc_target_flags = $(core-cfgs) $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE @@ -381,7 +381,7 @@ $(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE +$(call if_changed_dep,rustc_library) $(obj)/alloc.o: private skip_clippy = 1 -$(obj)/alloc.o: private skip_flags = -Dunreachable_pub +$(obj)/alloc.o: private skip_flags = -Wunreachable_pub $(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs) $(obj)/alloc.o: $(RUST_LIB_SRC)/alloc/src/lib.rs $(obj)/compiler_builtins.o FORCE +$(call if_changed_dep,rustc_library) From bb421b517e4b05d1e971c1fbf6af2f241accbf52 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:00 +0200 Subject: [PATCH 042/120] rust: simplify Clippy warning flags set All Clippy lint groups that we enable, except `correctness`, have a default `warn` level, thus they may be removed now that we relaxed all lints to `warn`. Moreover, Clippy provides an `all` lint group that covers the groups we enable by default. Thus just use `all` instead -- the only change is that, if Clippy introduces a new lint group or splits an existing one, we will cover that one automatically. In addition, `let_unit_value` is in `style` since Rust 1.62.0, thus it does not need to be enabled manually. Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-6-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 7ea526814fdb..9044fdb9adb1 100644 --- a/Makefile +++ b/Makefile @@ -467,10 +467,8 @@ export rust_common_flags := --edition=2021 \ -Wunreachable_pub \ -Wmissing_docs \ -Wrustdoc::missing_crate_level_docs \ - -Wclippy::correctness -Wclippy::style \ - -Wclippy::suspicious -Wclippy::complexity \ - -Wclippy::perf \ - -Wclippy::let_unit_value -Wclippy::mut_mut \ + -Wclippy::all \ + -Wclippy::mut_mut \ -Wclippy::needless_bitwise_bool \ -Wclippy::needless_continue \ -Wclippy::no_mangle_with_rust_abi \ From 63b27f4a0074bc6ef987a44ee9ad8bf960b568c2 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:01 +0200 Subject: [PATCH 043/120] rust: start supporting several compiler versions It is time to start supporting several Rust compiler versions and thus establish a minimum Rust version. We may still want to upgrade the minimum sometimes in the beginning since there may be important features coming into the language that improve how we write code (e.g. field projections), which may or may not make sense to support conditionally. We will start with a window of two stable releases, and widen it over time. Thus this patch does not move the current minimum (1.78.0), but instead adds support for the recently released 1.79.0. This should already be enough for kernel developers in distributions that provide recent Rust compiler versions routinely, such as Arch Linux, Debian Unstable (outside the freeze period), Fedora Linux, Gentoo Linux (especially the testing channel), Nix (unstable) and openSUSE Tumbleweed. See the documentation patch about it later in this series. In addition, Rust for Linux is now being built-tested in Rust's pre-merge CI [1]. That is, every change that is attempting to land into the Rust compiler is tested against the kernel, and it is merged only if it passes -- thanks to the Rust project for that! Thus, with the pre-merge CI in place, both projects hope to avoid unintentional changes to Rust that break the kernel. This means that, in general, apart from intentional changes on their side (that we will need to workaround conditionally on our side), the upcoming Rust compiler versions should generally work. For instance, currently, the beta (1.80.0) and nightly (1.81.0) branches work as well. Of course, the Rust for Linux CI job in the Rust toolchain may still need to be temporarily disabled for different reasons, but the intention is to help bring Rust for Linux into stable Rust. Link: https://github.com/rust-lang/rust/pull/125209 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-7-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/process/changes.rst | 4 +--- Documentation/rust/quick-start.rst | 15 +++++++-------- scripts/rust_is_available.sh | 8 -------- scripts/rust_is_available_test.py | 5 ----- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 5685d7bfe4d0..0d0b7120792b 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -88,9 +88,7 @@ docs on :ref:`Building Linux with Clang/LLVM `. Rust (optional) --------------- -A particular version of the Rust toolchain is required. Newer versions may or -may not work because the kernel depends on some unstable Rust features, for -the moment. +A recent version of the Rust compiler is required. Each Rust toolchain comes with several "components", some of which are required (like ``rustc``) and some that are optional. The ``rust-src`` component (which diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index ac2f16288458..89bbfde8c96c 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -36,16 +36,15 @@ if that is the case. rustc ***** -A particular version of the Rust compiler is required. Newer versions may or -may not work because, for the moment, the kernel depends on some unstable -Rust features. +A recent version of the Rust compiler is required. If ``rustup`` is being used, enter the kernel build directory (or use -``--path=`` argument to the ``set`` sub-command) and run:: +``--path=`` argument to the ``set`` sub-command) and run, +for instance:: - rustup override set $(scripts/min-tool-version.sh rustc) + rustup override set stable -This will configure your working directory to use the correct version of +This will configure your working directory to use the given version of ``rustc`` without affecting your default toolchain. Note that the override applies to the current working directory (and its @@ -72,9 +71,9 @@ version later on requires re-adding the component. Otherwise, if a standalone installer is used, the Rust source tree may be downloaded into the toolchain's installation folder:: - curl -L "https://static.rust-lang.org/dist/rust-src-$(scripts/min-tool-version.sh rustc).tar.gz" | + curl -L "https://static.rust-lang.org/dist/rust-src-$(rustc --version | cut -d' ' -f2).tar.gz" | tar -xzf - -C "$(rustc --print sysroot)/lib" \ - "rust-src-$(scripts/min-tool-version.sh rustc)/rust-src/lib/" \ + "rust-src-$(rustc --version | cut -d' ' -f2)/rust-src/lib/" \ --strip-components=3 In this case, upgrading the Rust compiler version later on requires manually diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 117018946b57..67cb900124cc 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -117,14 +117,6 @@ if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then echo >&2 "***" exit 1 fi -if [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_compiler_version" - echo >&2 "*** Expected version: $rust_compiler_min_version" - echo >&2 "***" - warning=1 -fi # Check that the Rust bindings generator is suitable. # diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index 57613fe5ed75..a255f79aafc2 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -193,11 +193,6 @@ else: result = self.run_script(self.Expected.FAILURE, { "RUSTC": rustc }) self.assertIn(f"Rust compiler '{rustc}' is too old.", result.stderr) - def test_rustc_new_version(self): - rustc = self.generate_rustc("rustc 1.999.0 (a8314ef7d 2099-06-27)") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "RUSTC": rustc }) - self.assertIn(f"Rust compiler '{rustc}' is too new. This may or may not work.", result.stderr) - def test_bindgen_nonexecutable(self): result = self.run_script(self.Expected.FAILURE, { "BINDGEN": self.nonexecutable }) self.assertIn(f"Running '{self.nonexecutable}' to check the Rust bindings generator version failed with", result.stderr) From d49082faf6a001019693a837dea7b958048c731c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:02 +0200 Subject: [PATCH 044/120] rust: avoid assuming a particular `bindgen` build `bindgen`'s logic to find `libclang` (via `clang-sys`) may change over time, and depends on how it was built (e.g. Linux distributions may decide to build it differently, and we are going to provide documentation on installing it via distributions later in this series). Therefore, clarify that `bindgen` may be built in several ways and simplify the documentation by only mentioning the most prominent environment variable (`LIBCLANG_PATH`) as an example on how to tweak the search of the library at runtime (i.e. when `bindgen` is built as our documentation explains). This also avoids duplicating the documentation, like `bindgen` itself does (i.e. it refers to `clang-sys`). Similarly, replace the test we had for this (which used the real program) with a mocked one, to avoid depending on the particular build as well. Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-8-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 21 ++++++++------------- scripts/rust_is_available_test.py | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 89bbfde8c96c..5ea8c8914942 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -113,20 +113,15 @@ Install it via (note that this will download and build the tool from source):: cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli -``bindgen`` needs to find a suitable ``libclang`` in order to work. If it is -not found (or a different ``libclang`` than the one found should be used), -the process can be tweaked using the environment variables understood by -``clang-sys`` (the Rust bindings crate that ``bindgen`` uses to access -``libclang``): +``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which +may be linked statically, dynamically or loaded at runtime). By default, the +``cargo`` command above will produce a ``bindgen`` binary that will load +``libclang`` at runtime. If it is not found (or a different ``libclang`` than +the one found should be used), the process can be tweaked, e.g. by using the +``LIBCLANG_PATH`` environment variable. For details, please see ``clang-sys``'s +documentation at: -* ``LLVM_CONFIG_PATH`` can be pointed to an ``llvm-config`` executable. - -* Or ``LIBCLANG_PATH`` can be pointed to a ``libclang`` shared library - or to the directory containing it. - -* Or ``CLANG_PATH`` can be pointed to a ``clang`` executable. - -For details, please see ``clang-sys``'s documentation at: + https://github.com/KyleMayes/clang-sys#linking https://github.com/KyleMayes/clang-sys#environment-variables diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index a255f79aafc2..0481aab862ec 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -55,10 +55,15 @@ else: @classmethod def generate_bindgen(cls, version_stdout, libclang_stderr): + if libclang_stderr is None: + libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})" + else: + libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)" + return cls.generate_executable(f"""#!/usr/bin/env python3 import sys if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv): - print({repr(libclang_stderr)}, file=sys.stderr) + {libclang_case} else: print({repr(version_stdout)}) """) @@ -67,6 +72,10 @@ else: def generate_bindgen_version(cls, stdout): return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr) + @classmethod + def generate_bindgen_libclang_failure(cls): + return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, None) + @classmethod def generate_bindgen_libclang(cls, stderr): return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, stderr) @@ -89,6 +98,7 @@ else: cls.rust_default_sysroot = subprocess.check_output(("rustc", "--print", "sysroot")).decode().strip() cls.bindgen_default_bindgen_version_stdout = f"bindgen {cls.bindgen_default_version}" + cls.bindgen_default_bindgen_libclang_failure_exit_code = 42 cls.bindgen_default_bindgen_libclang_stderr = f"scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version {cls.llvm_default_version} [-W#pragma-messages], err: false" cls.default_rustc = cls.generate_rustc(f"rustc {cls.rustc_default_version}") @@ -227,15 +237,10 @@ else: self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr) def test_bindgen_libclang_failure(self): - for env in ( - { "LLVM_CONFIG_PATH": self.missing }, - { "LIBCLANG_PATH": self.missing }, - { "CLANG_PATH": self.missing }, - ): - with self.subTest(env=env): - result = self.run_script(self.Expected.FAILURE, env | { "PATH": os.environ["PATH"], "BINDGEN": "bindgen" }) - self.assertIn("Running 'bindgen' to check the libclang version (used by the Rust", result.stderr) - self.assertIn("bindings generator) failed with code ", result.stderr) + bindgen = self.generate_bindgen_libclang_failure() + result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) + self.assertIn(f"Running '{bindgen}' to check the libclang version (used by the Rust", result.stderr) + self.assertIn(f"bindings generator) failed with code {self.bindgen_default_bindgen_libclang_failure_exit_code}. This may be caused by", result.stderr) def test_bindgen_libclang_unexpected_version(self): bindgen = self.generate_bindgen_libclang("scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version unexpected [-W#pragma-messages], err: false") From 9e98db17837093cb0f4dcfcc3524739d93249c45 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:03 +0200 Subject: [PATCH 045/120] rust: work around `bindgen` 0.69.0 issue `bindgen` 0.69.0 contains a bug: `--version` does not work without providing a header [1]: error: the following required arguments were not provided:
Usage: bindgen
-- ... Thus, in preparation for supporting several `bindgen` versions, work around the issue by passing a dummy argument. Include a comment so that we can remove the workaround in the future. Link: https://github.com/rust-lang/rust-bindgen/pull/2678 [1] Reviewed-by: Finn Behrens Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-9-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- init/Kconfig | 5 ++++- scripts/rust_is_available.sh | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index febdea2afc3b..94e20d3b99d4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1928,7 +1928,10 @@ config RUSTC_VERSION_TEXT config BINDGEN_VERSION_TEXT string depends on RUST - default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) + # The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 + # (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when + # the minimum version is upgraded past that (0.69.1 already fixed the issue). + default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n) # # Place an empty function call at each tracepoint site. Can be diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 67cb900124cc..1881e8f2a2b9 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -121,8 +121,12 @@ fi # Check that the Rust bindings generator is suitable. # # Non-stable and distributions' versions may have a version suffix, e.g. `-dev`. +# +# The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0 +# (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when +# the minimum version is upgraded past that (0.69.1 already fixed the issue). rust_bindings_generator_output=$( \ - LC_ALL=C "$BINDGEN" --version 2>/dev/null + LC_ALL=C "$BINDGEN" --version workaround-for-0.69.0 2>/dev/null ) || rust_bindings_generator_code=$? if [ -n "$rust_bindings_generator_code" ]; then echo >&2 "***" From c844fa64a2d46982fe75e834f4a46c46d2b3b2e5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:04 +0200 Subject: [PATCH 046/120] rust: start supporting several `bindgen` versions With both the workaround for `bindgen` 0.69.0 and the warning about 0.66.0 and 0.66.1 in place, start supporting several `bindgen` versions, like it was done for the Rust compiler in a previous patch. All other versions, including the latest 0.69.4, build without errors. The `bindgen` project, like Rust, has also agreed to have the kernel in their CI [1] -- thanks! This should help both projects: `bindgen` will be able to detect early issues like those mentioned above, and the kernel will be very likely build with new releases (at least for the basic configuration being tested). Link: https://github.com/rust-lang/rust-bindgen/pull/2851 [1] Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-10-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/rust/quick-start.rst | 7 ++++--- scripts/rust_is_available.sh | 8 -------- scripts/rust_is_available_test.py | 5 ----- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 5ea8c8914942..66cefbab8f9a 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -107,11 +107,12 @@ bindgen ******* The bindings to the C side of the kernel are generated at build time using -the ``bindgen`` tool. A particular version is required. +the ``bindgen`` tool. -Install it via (note that this will download and build the tool from source):: +Install it, for instance, via (note that this will download and build the tool +from source):: - cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli + cargo install --locked bindgen-cli ``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which may be linked statically, dynamically or loaded at runtime). By default, the diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 1881e8f2a2b9..4531f9dd19d3 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -161,14 +161,6 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers echo >&2 "***" exit 1 fi -if [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then - echo >&2 "***" - echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work." - echo >&2 "*** Your version: $rust_bindings_generator_version" - echo >&2 "*** Expected version: $rust_bindings_generator_min_version" - echo >&2 "***" - warning=1 -fi # Check that the `libclang` used by the Rust bindings generator is suitable. # diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index 0481aab862ec..d6d54b7ea42a 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -231,11 +231,6 @@ else: result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr) - def test_bindgen_new_version(self): - bindgen = self.generate_bindgen_version("bindgen 0.999.0") - result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) - self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr) - def test_bindgen_libclang_failure(self): bindgen = self.generate_bindgen_libclang_failure() result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) From 981ad93c89a3c600dee9795d3ead105acc805483 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:05 +0200 Subject: [PATCH 047/120] rust: warn about `bindgen` versions 0.66.0 and 0.66.1 `bindgen` versions 0.66.0 and 0.66.1 panic due to C string literals with NUL characters [1]: panicked at .cargo/registry/src/index.crates.io-6f17d22bba15001f/bindgen-0.66.0/codegen/mod.rs:717:71: called `Result::unwrap()` on an `Err` value: FromBytesWithNulError { kind: InteriorNul(4) } Thus, in preparation for supporting several `bindgen` versions, add a version check to warn the user about it. Since some distributions may have patched it (e.g. Debian did [2]), check if that seems to be the case (after the version check matches), in order to avoid printing a warning in that case. We could make it an error, but 1) it is going to fail anyway later in the build, 2) we would disable `RUST`, which is also painful, 3) someone could have patched it in a way that still makes our extra check fail (however unlikely), 4) the interior NUL may go away in the headers (however unlikely). Thus just warn about it so that users know why it is failing. In addition, add a couple tests for the new cases. Link: https://github.com/rust-lang/rust-bindgen/pull/2567 [1] Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1069047 [2] Link: https://lore.kernel.org/r/20240709160615.998336-11-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/rust_is_available.sh | 13 ++++++++++++ scripts/rust_is_available_bindgen_0_66.h | 2 ++ scripts/rust_is_available_test.py | 26 +++++++++++++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 scripts/rust_is_available_bindgen_0_66.h diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh index 4531f9dd19d3..5262c56dd674 100755 --- a/scripts/rust_is_available.sh +++ b/scripts/rust_is_available.sh @@ -161,6 +161,19 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers echo >&2 "***" exit 1 fi +if [ "$rust_bindings_generator_cversion" -eq 6600 ] || + [ "$rust_bindings_generator_cversion" -eq 6601 ]; then + # Distributions may have patched the issue (e.g. Debian did). + if ! "$BINDGEN" $(dirname $0)/rust_is_available_bindgen_0_66.h >/dev/null; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' versions 0.66.0 and 0.66.1 may not" + echo >&2 "*** work due to a bug (https://github.com/rust-lang/rust-bindgen/pull/2567)," + echo >&2 "*** unless patched (like Debian's)." + echo >&2 "*** Your version: $rust_bindings_generator_version" + echo >&2 "***" + warning=1 + fi +fi # Check that the `libclang` used by the Rust bindings generator is suitable. # diff --git a/scripts/rust_is_available_bindgen_0_66.h b/scripts/rust_is_available_bindgen_0_66.h new file mode 100644 index 000000000000..c0431293421c --- /dev/null +++ b/scripts/rust_is_available_bindgen_0_66.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#define A "\0" diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py index d6d54b7ea42a..413741037fb3 100755 --- a/scripts/rust_is_available_test.py +++ b/scripts/rust_is_available_test.py @@ -54,23 +54,30 @@ else: """) @classmethod - def generate_bindgen(cls, version_stdout, libclang_stderr): + def generate_bindgen(cls, version_stdout, libclang_stderr, version_0_66_patched=False): if libclang_stderr is None: libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})" else: libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)" + if version_0_66_patched: + version_0_66_case = "pass" + else: + version_0_66_case = "raise SystemExit(1)" + return cls.generate_executable(f"""#!/usr/bin/env python3 import sys if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv): {libclang_case} +elif "rust_is_available_bindgen_0_66.h" in " ".join(sys.argv): + {version_0_66_case} else: print({repr(version_stdout)}) """) @classmethod - def generate_bindgen_version(cls, stdout): - return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr) + def generate_bindgen_version(cls, stdout, version_0_66_patched=False): + return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr, version_0_66_patched) @classmethod def generate_bindgen_libclang_failure(cls): @@ -231,6 +238,19 @@ else: result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr) + def test_bindgen_bad_version_0_66_0_and_0_66_1(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}") + result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen }) + self.assertIn(f"Rust bindings generator '{bindgen}' versions 0.66.0 and 0.66.1 may not", result.stderr) + + def test_bindgen_bad_version_0_66_0_and_0_66_1_patched(self): + for version in ("0.66.0", "0.66.1"): + with self.subTest(version=version): + bindgen = self.generate_bindgen_version(f"bindgen {version}", True) + result = self.run_script(self.Expected.SUCCESS, { "BINDGEN": bindgen }) + def test_bindgen_libclang_failure(self): bindgen = self.generate_bindgen_libclang_failure() result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen }) From b1263411112305acf2af728728591465becb45b0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 9 Jul 2024 18:06:08 +0200 Subject: [PATCH 048/120] docs: rust: quick-start: add section on Linux distributions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we are starting to support several Rust compiler and `bindgen` versions, there is a good chance some Linux distributions work out of the box. Thus, provide some instructions on how to set the toolchain up for a few major Linux distributions. This simplifies the setup users need to build the kernel. In addition, add an introduction to the document so that it is easier to understand its structure and move the LLVM+Rust kernel.org toolchains paragraph there (removing "depending on the Linux version"). We may want to reorganize the document or split it in the future, but I wanted to focus this commit on the new information added about each particular distribution. Finally, remove the `rustup`'s components mention in `changes.rst` since users do not need it if they install the toolchain via the distributions (and anyway it was too detailed for that main document). Cc: Jan Alexander Steffens Cc: Johannes Löthberg Cc: Fabian Grünbichler Cc: Josh Stone Cc: Randy Barlow Cc: Anna (navi) Figueiredo Gomes Cc: Matoro Mahri Cc: Ryan Scheel Cc: figsoda Cc: Jörg Thalheim Cc: Theodore Ni <43ngvg@masqt.com> Cc: Winter Cc: William Brown Cc: Xiaoguang Wang Cc: Andrea Righi Cc: Zixing Liu Cc: Nathan Chancellor Tested-by: Benno Lossin Tested-by: Andreas Hindborg Link: https://lore.kernel.org/r/20240709160615.998336-14-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Documentation/process/changes.rst | 5 -- Documentation/rust/quick-start.rst | 93 ++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 17 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 0d0b7120792b..0ce96ae2588c 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -90,11 +90,6 @@ Rust (optional) A recent version of the Rust compiler is required. -Each Rust toolchain comes with several "components", some of which are required -(like ``rustc``) and some that are optional. The ``rust-src`` component (which -is optional) needs to be installed to build the kernel. Other components are -useful for developing. - Please see Documentation/rust/quick-start.rst for instructions on how to satisfy the build requirements of Rust support. In particular, the ``Makefile`` target ``rustavailable`` is useful to check why the Rust toolchain may not diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst index 66cefbab8f9a..d06a36106cd4 100644 --- a/Documentation/rust/quick-start.rst +++ b/Documentation/rust/quick-start.rst @@ -5,24 +5,93 @@ Quick Start This document describes how to get started with kernel development in Rust. +There are a few ways to install a Rust toolchain needed for kernel development. +A simple way is to use the packages from your Linux distribution if they are +suitable -- the first section below explains this approach. An advantage of this +approach is that, typically, the distribution will match the LLVM used by Rust +and Clang. + +Another way is using the prebuilt stable versions of LLVM+Rust provided on +`kernel.org `_. These are the same slim +and fast LLVM toolchains from :ref:`Getting LLVM ` with versions +of Rust added to them that Rust for Linux supports. Two sets are provided: the +"latest LLVM" and "matching LLVM" (please see the link for more information). + +Alternatively, the next two "Requirements" sections explain each component and +how to install them through ``rustup``, the standalone installers from Rust +and/or building them. + +The rest of the document explains other aspects on how to get started. + + +Distributions +------------- + +Arch Linux +********** + +Arch Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + pacman -S rust rust-src rust-bindgen + + +Debian +****** + +Debian Unstable (Sid), outside of the freeze period, provides recent Rust +releases and thus it should generally work out of the box, e.g.:: + + apt install rustc rust-src bindgen rustfmt rust-clippy + + +Fedora Linux +************ + +Fedora Linux provides recent Rust releases and thus it should generally work out +of the box, e.g.:: + + dnf install rust rust-src bindgen-cli rustfmt clippy + + +Gentoo Linux +************ + +Gentoo Linux (and especially the testing branch) provides recent Rust releases +and thus it should generally work out of the box, e.g.:: + + USE='rust-src rustfmt clippy' emerge dev-lang/rust dev-util/bindgen + +``LIBCLANG_PATH`` may need to be set. + + +Nix +*** + +Nix (unstable channel) provides recent Rust releases and thus it should +generally work out of the box, e.g.:: + + { pkgs ? import {} }: + pkgs.mkShell { + nativeBuildInputs = with pkgs; [ rustc rust-bindgen rustfmt clippy ]; + RUST_LIB_SRC = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}"; + } + + +openSUSE +******** + +openSUSE Slowroll and openSUSE Tumbleweed provide recent Rust releases and thus +they should generally work out of the box, e.g.:: + + zypper install rust rust1.79-src rust-bindgen clang + Requirements: Building ---------------------- This section explains how to fetch the tools needed for building. -Some of these requirements might be available from Linux distributions -under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, -at the time of writing, they are likely not to be recent enough unless -the distribution tracks the latest releases. - -Prebuilt stable versions of LLVM+Rust are provided on `kernel.org -`_. These are the same slim and fast -LLVM toolchains from :ref:`Getting LLVM ` with versions of Rust -added to them that Rust for Linux supports, depending on the Linux version. Two -sets are provided: the "latest LLVM" and "matching LLVM" (please see the link -for more information). - To easily check whether the requirements are met, the following target can be used:: From 8ecef8e01a08c7e3e4ffc8f08d9f9663984f334b Mon Sep 17 00:00:00 2001 From: peng guo Date: Wed, 10 Jul 2024 10:31:12 +0800 Subject: [PATCH 049/120] cxl/core: Fix incorrect vendor debug UUID define When user send a mbox command whose opcode is CXL_MBOX_OP_CLEAR_LOG and the in_payload is normal vendor debug log UUID according to the CXL specification cxl_payload_from_user_allowed() will return false unexpectedly, Sending mbox cmd operation fails and the kernel log will print: Clear Log: input payload not allowed. All CXL devices that support a debug log shall support the Vendor Debug Log to allow the log to be accessed through a common host driver, for any device, all versions of the CXL specification define the same value with Log Identifier of: 5e1819d9-11a9-400c-811f-d60719403d86 Refer to CXL spec r3.1 Table 8-71 Fix the definition value of DEFINE_CXL_VENDOR_DEBUG_UUID to match the CXL specification. Fixes: 472b1ce6e9d6 ("cxl/mem: Enable commands via CEL") Signed-off-by: peng guo Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240710023112.8063-1-engguopeng@buaa.edu.cn Signed-off-by: Dave Jiang --- drivers/cxl/cxlmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 19aba81cdf13..f55141c6d64e 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -562,7 +562,7 @@ enum cxl_opcode { 0x3b, 0x3f, 0x17) #define DEFINE_CXL_VENDOR_DEBUG_UUID \ - UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ + UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \ 0x40, 0x3d, 0x86) struct cxl_mbox_get_supported_logs { From bebfbbaffccfb69126c5a6a1d41cd868b6419370 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Fri, 28 Jun 2024 19:48:07 +0200 Subject: [PATCH 050/120] cxl/acpi: Warn on mixed CXL VH and RCH/RCD Hierarchy Each Host Bridge instance has a corresponding CXL Host Bridge Structure (CHBS) ACPI table that identifies its capabilities. CHBS tables can be two types (CXL 3.1 Table 9-21): The PCIe Root Complex Register Block (RCRB) and CXL Host Bridge Component Registers (CHBCR). If a Host Bridge is attached to a device that is operating in Restricted CXL Device Mode (RCD), BIOS publishes an RCRB with the base address of registers that describe its capabilities (CXL 3.1 sec. 9.11). Instead, the new (CXL 2.0+) Component registers can only be accessed by means of a base address published with a CHBCR (CXL 3.1 sec. 9.12). If an eRCD (a device that forces the host-bridge into CXL 1.1 Restricted CXL Host mode) is attached to a CXL 2.0+ Host-Bridge, the current CXL specification does not define a mechanism for finding CXL-2.0-only root-port component registers like HDM decoders and Extended Security capability. An algorithm to locate a CHBCR associated with an RCRB, would be too invasive to land without some concrete motivation. Therefore, just print a message to inform of unsupported config. Count how many different CHBS "Version" types are detected by cxl_get_chbs_iter(). Then make cxl_get_chbs() print a warning if that sum is greater than 1. Tested-by: Alison Schofield Signed-off-by: Fabio M. De Francesco Reviewed-by: Davidlohr Bueso Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240628175535.272472-1-fabio.m.de.francesco@linux.intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index e51315ea4a6a..574918a9ae3a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -482,6 +482,8 @@ struct cxl_chbs_context { unsigned long long uid; resource_size_t base; u32 cxl_version; + int nr_versions; + u32 saved_version; }; static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, @@ -490,22 +492,31 @@ static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg, struct cxl_chbs_context *ctx = arg; struct acpi_cedt_chbs *chbs; - if (ctx->base != CXL_RESOURCE_NONE) - return 0; - chbs = (struct acpi_cedt_chbs *) header; - if (ctx->uid != chbs->uid) - return 0; - - ctx->cxl_version = chbs->cxl_version; - if (!chbs->base) - return 0; - if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 && chbs->length != CXL_RCRB_SIZE) return 0; + if (!chbs->base) + return 0; + + if (ctx->saved_version != chbs->cxl_version) { + /* + * cxl_version cannot be overwritten before the next two + * checks, then use saved_version + */ + ctx->saved_version = chbs->cxl_version; + ctx->nr_versions++; + } + + if (ctx->base != CXL_RESOURCE_NONE) + return 0; + + if (ctx->uid != chbs->uid) + return 0; + + ctx->cxl_version = chbs->cxl_version; ctx->base = chbs->base; return 0; @@ -529,10 +540,19 @@ static int cxl_get_chbs(struct device *dev, struct acpi_device *hb, .uid = uid, .base = CXL_RESOURCE_NONE, .cxl_version = UINT_MAX, + .saved_version = UINT_MAX, }; acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx); + if (ctx->nr_versions > 1) { + /* + * Disclaim eRCD support given some component register may + * only be found via CHBCR + */ + dev_info(dev, "Unsupported platform config, mixed Virtual Host and Restricted CXL Host hierarchy."); + } + return 0; } From 591209c79844c1ecbee79e6b5a019e5b61eab8d3 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 6 Jul 2024 18:53:43 -0700 Subject: [PATCH 051/120] cxl/memdev: Replace ENXIO with EBUSY for inject poison limit reached The CXL driver provides a debugfs interface offering users the ability to inject and clear poison to a memdev. Once a user has injected up to the devices limit further injection requests fail with ENXIO until a clear poison is issued. Users may not have device specs in hand or may want to intentionally hit the limit and then clear. Replace the usual ENXIO return status with EBUSY so users can recognize this failure. Signed-off-by: Alison Schofield Tested-by: Xingtao Yao Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/825bd4c67fb55a4373c4182d999ad49d4e6b4fe7.1720316188.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- Documentation/ABI/testing/debugfs-cxl | 7 ++++--- drivers/cxl/cxlmem.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl index c61f9b813973..12488c14be64 100644 --- a/Documentation/ABI/testing/debugfs-cxl +++ b/Documentation/ABI/testing/debugfs-cxl @@ -14,9 +14,10 @@ Description: event to its internal Informational Event log, updates the Event Status register, and if configured, interrupts the host. It is not an error to inject poison into an address that - already has poison present and no error is returned. The - inject_poison attribute is only visible for devices supporting - the capability. + already has poison present and no error is returned. If the + device returns 'Inject Poison Limit Reached' an -EBUSY error + is returned to the user. The inject_poison attribute is only + visible for devices supporting the capability. What: /sys/kernel/debug/memX/clear_poison diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index f55141c6d64e..40a465c37e03 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -161,7 +161,7 @@ struct cxl_mbox_cmd { C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"), \ C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"), \ C(PADDR, -EFAULT, "physical address specified is invalid"), \ - C(POISONLMT, -ENXIO, "poison injection limit has been reached"), \ + C(POISONLMT, -EBUSY, "poison injection limit has been reached"), \ C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"), \ C(ABORT, -ENXIO, "background cmd was aborted by device"), \ C(SECURITY, -ENXIO, "not valid in the current security state"), \ From 3a8617c7df6eb351227aad9b0df647f34a7ef423 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Sat, 6 Jul 2024 18:53:44 -0700 Subject: [PATCH 052/120] cxl/test: Replace ENXIO with EBUSY for inject poison limit reached The CXL driver was recently updated to return EBUSY rather than ENXIO when the device reports that an injection request exceeds the device's limit. That change to EBUSY allows debug users to differentiate between limit reached and inject failures for any other reason. Change cxl-test to also return EBUSY and tidy up the dev_dbg() messaging to emit the correct limit. Reminder: the cxl-test per device injection limit is a configurable attribute: /sys/bus/platform/drivers/cxl_mock_mem/poison_inject_max Signed-off-by: Alison Schofield Tested-by: Xingtao Yao Reviewed-by: Dan Williams Reviewed-by: Davidlohr Bueso Link: https://patch.msgid.link/ba1b80e1658b644d85d0d5e2287112d00a48b9cf.1720316188.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- tools/testing/cxl/test/mem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 94de2cf60f4f..129f179b0ac5 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1135,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds) return (count >= poison_inject_dev_max); } -static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) +static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa) { + /* Return EBUSY to match the CXL driver handling */ if (mock_poison_dev_max_injected(cxlds)) { dev_dbg(cxlds->dev, "Device poison injection limit has been reached: %d\n", - MOCK_INJECT_DEV_MAX); - return false; + poison_inject_dev_max); + return -EBUSY; } for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) { if (!mock_poison_list[i].cxlds) { mock_poison_list[i].cxlds = cxlds; mock_poison_list[i].dpa = dpa; - return true; + return 0; } } dev_dbg(cxlds->dev, "Mock test poison injection limit has been reached: %d\n", MOCK_INJECT_TEST_MAX); - return false; + return -ENXIO; } static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa) @@ -1179,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds, dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa); return 0; } - if (!mock_poison_add(cxlds, dpa)) - return -ENXIO; - return 0; + return mock_poison_add(cxlds, dpa); } static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa) From 9aa5f6235e16ac6fceed789a30e72addf1abd7d8 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:49 -0700 Subject: [PATCH 053/120] cxl/core: Fold cxl_trace_hpa() into cxl_dpa_to_hpa() Although cxl_trace_hpa() is used to populate TRACE EVENTs with HPA addresses the work it performs is a DPA to HPA translation not a trace. Tidy up this naming by moving the minimal work done in cxl_trace_hpa() into cxl_dpa_to_hpa() and use cxl_dpa_to_hpa() for trace event callbacks. Suggested-by: Dan Williams Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Robert Richter Link: https://patch.msgid.link/452a9b0c525b774c72d9d5851515ffa928750132.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 8 ++++---- drivers/cxl/core/mbox.c | 2 +- drivers/cxl/core/region.c | 33 +++++++++++++-------------------- drivers/cxl/core/trace.h | 4 ++-- 4 files changed, 20 insertions(+), 27 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 625394486459..72a506c9dbd0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -28,12 +28,12 @@ int cxl_region_init(void); void cxl_region_exit(void); int cxl_get_poison_by_endpoint(struct cxl_port *port); struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa); +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa); #else -static inline u64 -cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) +static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, + const struct cxl_memdev *cxlmd, u64 dpa) { return ULLONG_MAX; } diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index 2626f3fff201..eb0b08e5136f 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -878,7 +878,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd, dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK; cxlr = cxl_dpa_to_region(cxlmd, dpa); if (cxlr) - hpa = cxl_trace_hpa(cxlr, cxlmd, dpa); + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); if (event_type == CXL_CPER_EVENT_GEN_MEDIA) trace_cxl_general_media(cxlmd, type, cxlr, hpa, diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 3c2b6144be23..237c28d5f2cc 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2749,15 +2749,25 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) return false; } -static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled) +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) { u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; struct cxl_region_params *p = &cxlr->params; - int pos = cxled->pos; + struct cxl_endpoint_decoder *cxled = NULL; u16 eig = 0; u8 eiw = 0; + int pos; + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxlmd == cxled_to_memdev(cxled)) + break; + } + if (!cxled || cxlmd != cxled_to_memdev(cxled)) + return ULLONG_MAX; + + pos = cxled->pos; ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); @@ -2797,23 +2807,6 @@ static u64 cxl_dpa_to_hpa(u64 dpa, struct cxl_region *cxlr, return hpa; } -u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, - u64 dpa) -{ - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = NULL; - - for (int i = 0; i < p->nr_targets; i++) { - cxled = p->targets[i]; - if (cxlmd == cxled_to_memdev(cxled)) - break; - } - if (!cxled || cxlmd != cxled_to_memdev(cxled)) - return ULLONG_MAX; - - return cxl_dpa_to_hpa(dpa, cxlr, cxled); -} - static struct lock_class_key cxl_pmem_region_key; static int cxl_pmem_region_alloc(struct cxl_region *cxlr) diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index ee5cd4eb2f16..21b76e9c5c60 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -704,8 +704,8 @@ TRACE_EVENT(cxl_poison, if (cxlr) { __assign_str(region); memcpy(__entry->uuid, &cxlr->params.uuid, 16); - __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, - __entry->dpa); + __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, + __entry->dpa); } else { __assign_str(region); memset(__entry->uuid, 0, 16); From 3b2fedcd75e3991e77c2a8c3ebcab0ea68b2d69d Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:50 -0700 Subject: [PATCH 054/120] cxl: Restore XOR'd position bits during address translation When a device reports a DPA in events like poison, general_media, and dram, the driver translates that DPA back to an HPA. Presently, the CXL driver translation only considers the Modulo position and will report the wrong HPA for XOR configured root decoders. Add a helper function that restores the XOR'd bits during DPA->HPA address translation. Plumb a root decoder callback to the new helper when XOR interleave arithmetic is in use. For Modulo arithmetic, just let the callback be NULL - as in no extra work required. Upon completion of a DPA->HPA translation a couple of checks are performed on the result. One simply confirms that the calculated HPA is within the address range of the region. That test is useful for both Modulo and XOR interleave arithmetic decodes. A second check confirms that the HPA is within an expected chunk based on the endpoints position in the region and the region granularity. An XOR decode disrupts the Modulo pattern making the chunk check useless. To align the checks with the proper decode, pull the region range check inline and use the helper to do the chunk check for Modulo decodes only. A cxl-test unit test is posted for upstream review here: https://lore.kernel.org/20240624210644.495563-1-alison.schofield@intel.com/ Fixes: 28a3ae4ff66c ("cxl/trace: Add an HPA to cxl_poison trace events") Signed-off-by: Alison Schofield Tested-by: Diego Garcia Rodriguez Reviewed-by: Dan Williams Link: https://patch.msgid.link/1a1ac880d9f889bd6384e657e810431b9a0a72e5.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 40 +++++++++++++++++++++++++++++++++++++++ drivers/cxl/core/region.c | 23 +++++++++++++--------- drivers/cxl/cxl.h | 3 +++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 571069863c62..6b6ae9c81368 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -74,6 +74,43 @@ static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos) return cxlrd->cxlsd.target[n]; } +static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) +{ + struct cxl_cxims_data *cximsd = cxlrd->platform_data; + int hbiw = cxlrd->cxlsd.nr_targets; + u64 val; + int pos; + + /* No xormaps for host bridge interleave ways of 1 or 3 */ + if (hbiw == 1 || hbiw == 3) + return hpa; + + /* + * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) restore + * the position bit to its value before the xormap was applied at + * HPA->DPA translation. + * + * pos is the lowest set bit in an XORMAP + * val is the XORALLBITS(HPA & XORMAP) + * + * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS + * as an operation that outputs a single bit by XORing all the + * bits in the input (hpa & xormap). Implement XORALLBITS using + * hweight64(). If the hamming weight is even the XOR of those + * bits results in val==0, if odd the XOR result is val==1. + */ + + for (int i = 0; i < cximsd->nr_maps; i++) { + if (!cximsd->xormaps[i]) + continue; + pos = __ffs(cximsd->xormaps[i]); + val = (hweight64(hpa & cximsd->xormaps[i]) & 1); + hpa = (hpa & ~(1ULL << pos)) | (val << pos); + } + + return hpa; +} + struct cxl_cxims_context { struct device *dev; struct cxl_root_decoder *cxlrd; @@ -434,6 +471,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, cxlrd->qos_class = cfmws->qtg_id; + if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) + cxlrd->hpa_to_spa = cxl_xor_hpa_to_spa; + rc = cxl_decoder_add(cxld, target_map); if (rc) return rc; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 237c28d5f2cc..23abd0f7b856 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -2723,20 +2723,13 @@ struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) return ctx.cxlr; } -static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) +static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) { struct cxl_region_params *p = &cxlr->params; int gran = p->interleave_granularity; int ways = p->interleave_ways; u64 offset; - /* Is the hpa within this region at all */ - if (hpa < p->res->start || hpa > p->res->end) { - dev_dbg(&cxlr->dev, - "Addr trans fail: hpa 0x%llx not in region\n", hpa); - return false; - } - /* Is the hpa in an expected chunk for its pos(-ition) */ offset = hpa - p->res->start; offset = do_div(offset, gran * ways); @@ -2752,6 +2745,7 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos) u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; @@ -2801,7 +2795,18 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, /* Apply the hpa_offset to the region base address */ hpa = hpa_offset + p->res->start; - if (!cxl_is_hpa_in_range(hpa, cxlr, cxled->pos)) + /* Root decoder translation overrides typical modulo decode */ + if (cxlrd->hpa_to_spa) + hpa = cxlrd->hpa_to_spa(cxlrd, hpa); + + if (hpa < p->res->start || hpa > p->res->end) { + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in region\n", hpa); + return ULLONG_MAX; + } + + /* Simple chunk check, by pos & gran, only applies to modulo decodes */ + if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) return ULLONG_MAX; return hpa; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 603c0120cff8..dfc4f27d195c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -434,12 +434,14 @@ struct cxl_switch_decoder { struct cxl_root_decoder; typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, int pos); +typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations * @region_id: region id for next region provisioning event * @calc_hb: which host bridge covers the n'th position by granularity + * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range * @qos_class: QoS performance class cookie @@ -449,6 +451,7 @@ struct cxl_root_decoder { struct resource *res; atomic_t region_id; cxl_calc_hb_fn calc_hb; + cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; int qos_class; From 82a3e3a235633aa0575fac9507d648dd80f3437f Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:51 -0700 Subject: [PATCH 055/120] cxl/region: Verify target positions using the ordered target list When a root decoder is configured the interleave target list is read from the BIOS populated CFMWS structure. Per the CXL spec 3.1 Table 9-22 the target list is in interleave order. The CXL driver populates its decoder target list in the same order and stores it in 'struct cxl_switch_decoder' field "@target: active ordered target list in current decoder configuration" Given the promise of an ordered list, the driver can stop duplicating the work of BIOS and simply check target positions against the ordered list during region configuration. The simplified check against the ordered list is presented here. A follow-on patch will remove the unused code. For Modulo arithmetic this is not a fix, only a simplification. For XOR arithmetic this is a fix for HB IW of 3,6,12. Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)") Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/35d08d3aba08fee0f9b86ab1cef0c25116ca8a55.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 23abd0f7b856..2772828ca6ca 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1559,10 +1559,13 @@ static int cxl_region_attach_position(struct cxl_region *cxlr, const struct cxl_dport *dport, int pos) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw = cxld->interleave_ways; struct cxl_port *iter; int rc; - if (cxlrd->calc_hb(cxlrd, pos) != dport) { + if (dport != cxlrd->cxlsd.target[pos % iw]) { dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), dev_name(&cxlrd->cxlsd.cxld.dev)); From 8f55ada796565ce801418bf579f31a6a522d0337 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Tue, 2 Jul 2024 22:29:52 -0700 Subject: [PATCH 056/120] cxl: Remove defunct code calculating host bridge target positions The CXL Spec 3.1 Table 9-22 requires that the BIOS populate the CFMWS target list in interleave target order. This means the calculations the CXL driver added to determine positions when XOR math is in use, along with the entire XOR vs Modulo call back setup is not needed. A prior patch added a common method to verify positions. Remove the now unused code related to the cxl_calc_hb_fn. Signed-off-by: Alison Schofield Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/2e2c32a2d0f1007e920b58712d15edad2e48d857.1719980933.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 60 ++--------------------------------------- drivers/cxl/core/port.c | 20 +------------- drivers/cxl/cxl.h | 8 +----- 3 files changed, 4 insertions(+), 84 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 6b6ae9c81368..921cee3bb980 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -22,57 +22,6 @@ static const guid_t acpi_cxl_qtg_id_guid = GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071, 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52); -/* - * Find a targets entry (n) in the host bridge interleave list. - * CXL Specification 3.0 Table 9-22 - */ -static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw, - int ig) -{ - int i = 0, n = 0; - u8 eiw; - - /* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */ - if (iw != 3) { - for (i = 0; i < cximsd->nr_maps; i++) - n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i; - } - /* IW: 3,6,12 add a modulo calculation to 'n' */ - if (!is_power_of_2(iw)) { - if (ways_to_eiw(iw, &eiw)) - return -1; - hpa &= GENMASK_ULL(51, eiw + ig); - n |= do_div(hpa, 3) << i; - } - return n; -} - -static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos) -{ - struct cxl_cxims_data *cximsd = cxlrd->platform_data; - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int ig = cxld->interleave_granularity; - int iw = cxld->interleave_ways; - int n = 0; - u64 hpa; - - if (dev_WARN_ONCE(&cxld->dev, - cxld->interleave_ways != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; - - hpa = cxlrd->res->start + pos * ig; - - /* Entry (n) is 0 for no interleave (iw == 1) */ - if (iw != 1) - n = cxl_xor_calc_n(hpa, cximsd, iw, ig); - - if (n < 0) - return NULL; - - return cxlrd->cxlsd.target[n]; -} static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa) { @@ -398,7 +347,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, struct cxl_port *root_port = ctx->root_port; struct cxl_cxims_context cxims_ctx; struct device *dev = ctx->dev; - cxl_calc_hb_fn cxl_calc_hb; struct cxl_decoder *cxld; unsigned int ways, i, ig; int rc; @@ -426,13 +374,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws, if (rc) return rc; - if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) - cxl_calc_hb = cxl_hb_modulo; - else - cxl_calc_hb = cxl_hb_xor; - struct cxl_root_decoder *cxlrd __free(put_cxlrd) = - cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb); + cxl_root_decoder_alloc(root_port, ways); + if (IS_ERR(cxlrd)) return PTR_ERR(cxlrd); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 887ed6e358fb..bc9e18d02598 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1733,21 +1733,6 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, return 0; } -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) -{ - struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; - struct cxl_decoder *cxld = &cxlsd->cxld; - int iw; - - iw = cxld->interleave_ways; - if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets, - "misconfigured root decoder\n")) - return NULL; - - return cxlrd->cxlsd.target[pos % iw]; -} -EXPORT_SYMBOL_NS_GPL(cxl_hb_modulo, CXL); - static struct lock_class_key cxl_decoder_key; /** @@ -1807,7 +1792,6 @@ static int cxl_switch_decoder_init(struct cxl_port *port, * cxl_root_decoder_alloc - Allocate a root level decoder * @port: owning CXL root of this decoder * @nr_targets: static number of downstream targets - * @calc_hb: which host bridge covers the n'th position by granularity * * Return: A new cxl decoder to be registered by cxl_decoder_add(). A * 'CXL root' decoder is one that decodes from a top-level / static platform @@ -1815,8 +1799,7 @@ static int cxl_switch_decoder_init(struct cxl_port *port, * topology. */ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb) + unsigned int nr_targets) { struct cxl_root_decoder *cxlrd; struct cxl_switch_decoder *cxlsd; @@ -1838,7 +1821,6 @@ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, return ERR_PTR(rc); } - cxlrd->calc_hb = calc_hb; mutex_init(&cxlrd->range_lock); cxld = &cxlsd->cxld; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index dfc4f27d195c..e27a0cd4f107 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -432,15 +432,12 @@ struct cxl_switch_decoder { }; struct cxl_root_decoder; -typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd, - int pos); typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations * @region_id: region id for next region provisioning event - * @calc_hb: which host bridge covers the n'th position by granularity * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address * @platform_data: platform specific configuration data * @range_lock: sync region autodiscovery by address range @@ -450,7 +447,6 @@ typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa); struct cxl_root_decoder { struct resource *res; atomic_t region_id; - cxl_calc_hb_fn calc_hb; cxl_hpa_to_spa_fn hpa_to_spa; void *platform_data; struct mutex range_lock; @@ -775,9 +771,7 @@ bool is_root_decoder(struct device *dev); bool is_switch_decoder(struct device *dev); bool is_endpoint_decoder(struct device *dev); struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets, - cxl_calc_hb_fn calc_hb); -struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos); + unsigned int nr_targets); struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); From 745d9f4a31defec731119ee8aad8ba9f2536dd9a Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Thu, 29 Feb 2024 23:42:36 +0300 Subject: [PATCH 057/120] ubi: eba: properly rollback inside self_check_eba In case of a memory allocation failure in the volumes loop we can only process the already allocated scan_eba and fm_eba array elements on the error path - others are still uninitialized. Found by Linux Verification Center (linuxtesting.org). Fixes: 00abf3041590 ("UBI: Add self_check_eba()") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/eba.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index e5ac3cd0bbae..c7ba7a15c9f7 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1564,6 +1564,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, GFP_KERNEL); if (!fm_eba[i]) { ret = -ENOMEM; + kfree(scan_eba[i]); goto out_free; } @@ -1599,7 +1600,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap, } out_free: - for (i = 0; i < num_volumes; i++) { + while (--i >= 0) { if (!ubi->volumes[i]) continue; From 299af26eb46374295a3289be990e536b75c9376a Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:35:38 -0300 Subject: [PATCH 058/120] mtd: ubi: make ubi_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the ubi_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Zhihao Cheng Reviewed-by: Miquel Raynal Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 2 +- drivers/mtd/ubi/ubi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index a7e3a6246c0e..952c80269f57 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -112,7 +112,7 @@ static struct attribute *ubi_class_attrs[] = { ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '//class/ubi/') */ -struct class ubi_class = { +const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 32009a24869e..1aead55090d0 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -814,7 +814,7 @@ extern struct kmem_cache *ubi_wl_entry_slab; extern const struct file_operations ubi_ctrl_cdev_operations; extern const struct file_operations ubi_cdev_operations; extern const struct file_operations ubi_vol_cdev_operations; -extern struct class ubi_class; +extern const struct class ubi_class; extern struct mutex ubi_devices_mutex; extern struct blocking_notifier_head ubi_notifiers; From 02096a0cf150fc8dc1cfe39afcd768d06b70e722 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 Mar 2024 09:46:52 +0100 Subject: [PATCH 059/120] mtd: ubi: avoid expensive do_div() on 32-bit machines The use of do_div() in ubi_nvmem_reg_read() makes calling it on 32-bit machines rather expensive. Since the 'from' variable is known to be a 32-bit quantity, it is clearly never needed and can be optimized into a regular division operation. Fixes: b8a77b9a5f9c ("mtd: ubi: fix NVMEM over UBI volumes on 32-bit systems") Fixes: 3ce485803da1 ("mtd: ubi: provide NVMEM layer over UBI volumes") Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/nvmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c index 8aeb9c428e51..a94a1a9aaec1 100644 --- a/drivers/mtd/ubi/nvmem.c +++ b/drivers/mtd/ubi/nvmem.c @@ -6,7 +6,6 @@ /* UBI NVMEM provider */ #include "ubi.h" #include -#include /* List of all NVMEM devices */ static LIST_HEAD(nvmem_devices); @@ -27,14 +26,15 @@ static int ubi_nvmem_reg_read(void *priv, unsigned int from, struct ubi_nvmem *unv = priv; struct ubi_volume_desc *desc; uint32_t offs; - uint64_t lnum = from; + uint32_t lnum; int err = 0; desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); if (IS_ERR(desc)) return PTR_ERR(desc); - offs = do_div(lnum, unv->usable_leb_size); + offs = from % unv->usable_leb_size; + lnum = from / unv->usable_leb_size; while (bytes_left) { to_read = unv->usable_leb_size - offs; From 02eb1846b77d2896903e94b966ba632cf48d39e0 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:43 +0800 Subject: [PATCH 060/120] ubifs: Fix unattached xattr inode if powercut happens after deleting When powercut happens after deleting file, the xattr inode could be alone existing in TNC but its' xattr entry cannot be found in TNC. File inode and xattr inode are added into orphan list after deleting file, file inode's nlink is 0 but xattr inode's nlink is not 0 (PS: zero nlink xattr inode is written on disk in evicting process by ubifs_jnl_write_inode). So, following process could happen: 1. touch file 2. setxattr(file) 3. unlink file // inode(nlink=0), xattr inode(nlink=1) are added into orphan list 4. commit // write inode inum and xattr inum into orphan area 5. powercut 6. mount do_kill_orphans // inode(nlink=0) is deleted from TNC by ubifs_tnc_remove_range, // xattr entry is deleted too. // xattr inode(nlink=1) is not deleted from TNC Finally we could see following error while debugging UBIFS: UBIFS error (ubi0:0 pid 1093): dbg_check_filesystem [ubifs]: inode 66 nlink is 1, but calculated nlink is 0 UBIFS (ubi0:0): dump of the inode 66 sitting in LEB 12:2128 node_type 0 (inode node) group_type 1 (in node group) len 197 key (66, inode) size 37 nlink 1 flags 0x20 xattr_cnt 0 xattr_size 0 xattr_names 0 data len 37 Fix it by removing entire inode with it's xattrs while replaying orphan, just replace function ubifs_tnc_remove_range by ubifs_tnc_remove_ino. Fixes: ee1438ce5dc4 ("ubifs: Check link count of inodes when killing orphans.") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218661 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 4909321d84cf..ddeb125e6930 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -691,12 +691,12 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; for (i = 0; i < n; i++) { - union ubifs_key key1, key2; + union ubifs_key key; inum = le64_to_cpu(orph->inos[i]); - ino_key_init(c, &key1, inum); - err = ubifs_tnc_lookup(c, &key1, ino); + ino_key_init(c, &key, inum); + err = ubifs_tnc_lookup(c, &key, ino); if (err && err != -ENOENT) goto out_free; @@ -708,10 +708,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, dbg_rcvry("deleting orphaned inode %lu", (unsigned long)inum); - lowest_ino_key(c, &key1, inum); - highest_ino_key(c, &key2, inum); - - err = ubifs_tnc_remove_range(c, &key1, &key2); + err = ubifs_tnc_remove_ino(c, inum); if (err) goto out_ro; } From 354c1796639a743b49bd13549ec0ec6ca121b85e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:44 +0800 Subject: [PATCH 061/120] ubifs: Don't add xattr inode into orphan area Now, the entire inode with its' xattrs are removed while replaying orphan nodes. There is no need to add xattr inodes into orphan area, which is based on the fact that xattr entries won't be cleared from disk before deleting xattr inodes, in another words, current logic can make sure that xattr inode be deleted in any cases even UBIFS not record xattr inode into orphan area. Let's looking for possible paths that could clear xattr entries from disk but leave the xattr inode on TNC: 1. unlink/tmpfile -> ubifs_jnl_update: inode(nlink=0) is written into bud LEB and added into orphan list, then: a. powercut: ubifs_tnc_remove_ino(xattr entry/inode can be found from TNC and being deleted) is invoked in replaying journal. b. commit + powercut: inode is written into orphan area, and ubifs_tnc_remove_ino is invoked in replaying orphan nodes. c. evicting + powercut: xattr inode(nlink=0) is written on disk, xattr is removed from TNC, gc could clear xattr entries from disk. ubifs_tnc_remove_ino will apply on inode and xattr inode in replaying journal, so lost xattr entries will make no influence. d. evicting + commit + powercut: xattr inode/entry are removed from index tree(on disk) by ubifs_jnl_write_inode, xattr inode is cleared from orphan area by ubifs_jnl_write_inode + commit. e. commit + evicting + powercut: inode is written into orphan area, then equivalent to c. 2. remove xattr -> ubifs_jnl_delete_xattr: xattr entry(inum=0) and xattr inode(nlink=0) is written into bud LEB, xattr entry/inode are removed from TNC, then: a. powercut: gc could clear xattr entries from disk, which won't affect deleting xattr entry from TNC. ubifs_tnc_remove_ino will apply on xattr inode in replaying journal, ubifs_tnc_remove_nm will apply on xattr entry in replaying journal. b. commit + powercut: xattr entry/inode are removed from index tree (on disk). Tracking xattr inode in orphan list is imported by commit 988bec41318f3f ("ubifs: orphan: Handle xattrs like files"), it aims to fix the similar problem described in commit 7959cf3a7506d4a ("ubifs: journal: Handle xattrs like files"). Actually, the problem only exist in journal case but not the orphan case. So, we can remove the orphan tracking for xattr inodes. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 85 ++++++++--------------------------------------- fs/ubifs/ubifs.h | 3 -- 2 files changed, 14 insertions(+), 74 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ddeb125e6930..88fbf331ad8c 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -42,24 +42,30 @@ static int dbg_check_orphans(struct ubifs_info *c); -static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, - struct ubifs_orphan *parent_orphan) +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inodes link count drops to + * zero. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) { struct ubifs_orphan *orphan, *o; struct rb_node **p, *parent = NULL; orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); if (!orphan) - return ERR_PTR(-ENOMEM); + return -ENOMEM; orphan->inum = inum; orphan->new = 1; - INIT_LIST_HEAD(&orphan->child_list); spin_lock(&c->orphan_lock); if (c->tot_orphans >= c->max_orphans) { spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-ENFILE); + return -ENFILE; } p = &c->orph_tree.rb_node; while (*p) { @@ -73,7 +79,7 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, ubifs_err(c, "orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); - return ERR_PTR(-EINVAL); + return -EINVAL; } } c->tot_orphans += 1; @@ -83,14 +89,9 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum, list_add_tail(&orphan->list, &c->orph_list); list_add_tail(&orphan->new_list, &c->orph_new); - if (parent_orphan) { - list_add_tail(&orphan->child_list, - &parent_orphan->child_list); - } - spin_unlock(&c->orphan_lock); dbg_gen("ino %lu", (unsigned long)inum); - return orphan; + return 0; } static struct ubifs_orphan *lookup_orphan(struct ubifs_info *c, ino_t inum) @@ -144,59 +145,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) __orphan_drop(c, orph); } -/** - * ubifs_add_orphan - add an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * Add an orphan. This function is called when an inodes link count drops to - * zero. - */ -int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) -{ - int err = 0; - ino_t xattr_inum; - union ubifs_key key; - struct ubifs_dent_node *xent, *pxent = NULL; - struct fscrypt_name nm = {0}; - struct ubifs_orphan *xattr_orphan; - struct ubifs_orphan *orphan; - - orphan = orphan_add(c, inum, NULL); - if (IS_ERR(orphan)) - return PTR_ERR(orphan); - - lowest_xent_key(c, &key, inum); - while (1) { - xent = ubifs_tnc_next_ent(c, &key, &nm); - if (IS_ERR(xent)) { - err = PTR_ERR(xent); - if (err == -ENOENT) - break; - kfree(pxent); - return err; - } - - fname_name(&nm) = xent->name; - fname_len(&nm) = le16_to_cpu(xent->nlen); - xattr_inum = le64_to_cpu(xent->inum); - - xattr_orphan = orphan_add(c, xattr_inum, orphan); - if (IS_ERR(xattr_orphan)) { - kfree(pxent); - kfree(xent); - return PTR_ERR(xattr_orphan); - } - - kfree(pxent); - pxent = xent; - key_read(c, &xent->key, &key); - } - kfree(pxent); - - return 0; -} - /** * ubifs_delete_orphan - delete an orphan. * @c: UBIFS file-system description object @@ -206,7 +154,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) */ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) { - struct ubifs_orphan *orph, *child_orph, *tmp_o; + struct ubifs_orphan *orph; spin_lock(&c->orphan_lock); @@ -219,11 +167,6 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) return; } - list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) { - list_del(&child_orph->child_list); - orphan_delete(c, child_orph); - } - orphan_delete(c, orph); spin_unlock(&c->orphan_lock); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 1f3ea879d93a..821c6ae99e62 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -923,8 +923,6 @@ struct ubifs_budget_req { * @rb: rb-tree node of rb-tree of orphans sorted by inode number * @list: list head of list of orphans in order added * @new_list: list head of list of orphans added since the last commit - * @child_list: list of xattr children if this orphan hosts xattrs, list head - * if this orphan is a xattr, not used otherwise. * @cnext: next orphan to commit * @dnext: next orphan to delete * @inum: inode number @@ -936,7 +934,6 @@ struct ubifs_orphan { struct rb_node rb; struct list_head list; struct list_head new_list; - struct list_head child_list; struct ubifs_orphan *cnext; struct ubifs_orphan *dnext; ino_t inum; From 7bed61a1cf166b5c113047fc8f60ff22dcb04893 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:45 +0800 Subject: [PATCH 062/120] Revert "ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path" This reverts commit 6379b44cdcd67f5f5d986b73953e99700591edfa. Commit 1e022216dcd2 ("ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path") is applied again in commit 6379b44cdcd6 ("ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path"), which changed ubifs_mknod (It won't become a real problem). Just revert it. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index eac0fef801f1..551148de66cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1133,8 +1133,6 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: - /* Free inode->i_link before inode is marked as bad. */ - fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: From 6376d7503b02796cc6109198d555ddebfb4f5f81 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:46 +0800 Subject: [PATCH 063/120] ubifs: Remove insert_dead_orphan from replaying orphan process UBIFS will do commit at the end of mounting process(rw mode), dead orphans(added by insert_dead_orphan in replaying orphan) are deleted by ubifs_orphan_end_commit(). The only reason why dead orphans are added into orphan list is that old orpans may be lost when powercut happens in ubifs_orphan_end_commit(): ubifs_orphan_end_commit // TNC(updated by orphans) is not written yet if (c->cmt_orphans != 0) commit_orphans consolidate // traverse orphan list write_orph_nodes // rewrite all orphans by ubifs_leb_change // If dead orphans are not in list, they will be lost when powercut // happens, then TNC won't be updated by old orphans in next mounting. Luckily, the condition 'c->cmt_orphans != 0' will never be true in mounting process, there can't be new orphans added into orphan list before mounting returned, but commit will be done at the end of mounting. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 49 ----------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 88fbf331ad8c..6e843e8fc3db 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -513,51 +513,6 @@ int ubifs_clear_orphans(struct ubifs_info *c) return 0; } -/** - * insert_dead_orphan - insert an orphan. - * @c: UBIFS file-system description object - * @inum: orphan inode number - * - * This function is a helper to the 'do_kill_orphans()' function. The orphan - * must be kept until the next commit, so it is added to the rb-tree and the - * deletion list. - */ -static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) -{ - struct ubifs_orphan *orphan, *o; - struct rb_node **p, *parent = NULL; - - orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); - if (!orphan) - return -ENOMEM; - orphan->inum = inum; - - p = &c->orph_tree.rb_node; - while (*p) { - parent = *p; - o = rb_entry(parent, struct ubifs_orphan, rb); - if (inum < o->inum) - p = &(*p)->rb_left; - else if (inum > o->inum) - p = &(*p)->rb_right; - else { - /* Already added - no problem */ - kfree(orphan); - return 0; - } - } - c->tot_orphans += 1; - rb_link_node(&orphan->rb, parent, p); - rb_insert_color(&orphan->rb, &c->orph_tree); - list_add_tail(&orphan->list, &c->orph_list); - orphan->del = 1; - orphan->dnext = c->orph_dnext; - c->orph_dnext = orphan; - dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, - c->new_orphans, c->tot_orphans); - return 0; -} - /** * do_kill_orphans - remove orphan inodes from the index. * @c: UBIFS file-system description object @@ -655,10 +610,6 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (err) goto out_ro; } - - err = insert_dead_orphan(c, inum); - if (err) - goto out_free; } *last_cmt_no = cmt_no; From 7efc34b53b3950ad4fc98b5d25210bae80ab0fde Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:47 +0800 Subject: [PATCH 064/120] ubifs: Fix adding orphan entry twice for the same inode The tmpfile could be added into orphan list twice, first time is creation, the second time is removing after it is linked. The orphan entry could be added twice for tmpfile if following sequence is satisfied: ubifs_tmpfile ubifs_jnl_update ubifs_add_orphan // first time to add orphan entry P1 P2 ubifs_link do_commit ubifs_orphan_start_commit orphan->cmt = 1 ubifs_delete_orphan orphan_delete if (orph->cmt) orph->del = 1; // orphan entry is not deleted from tree return ubifs_unlink ubifs_jnl_update ubifs_add_orphan orphan_add // found old orphan entry, second time to add orphan entry ubifs_err(c, "orphaned twice") return -EINVAL // unlink failed! ubifs_orphan_end_commit erase_deleted // delete old orphan entry rb_erase(&orphan->rb, &c->orph_tree) Fix it by removing orphan entry from orphan tree in advance, rather than remove it from orphan tree in committing process. Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218672 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 6e843e8fc3db..37d206097112 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -136,6 +136,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) if (orph->cmt) { orph->del = 1; + rb_erase(&orph->rb, &c->orph_tree); orph->dnext = c->orph_dnext; c->orph_dnext = orph; dbg_gen("delete later ino %lu", (unsigned long)orph->inum); @@ -461,7 +462,6 @@ static void erase_deleted(struct ubifs_info *c) dnext = orphan->dnext; ubifs_assert(c, !orphan->new); ubifs_assert(c, orphan->del); - rb_erase(&orphan->rb, &c->orph_tree); list_del(&orphan->list); c->tot_orphans -= 1; dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); From 9f5ecacfcee2bda21b100399a9130248d55c2a0c Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:48 +0800 Subject: [PATCH 065/120] ubifs: Move ui->data initialization after initializing security Host inode and its' xattr will be written on disk after initializing security when creating symlink or dev, then the host inode and its dentry will be written again in ubifs_jnl_update. There is no need to write inode data in the security initialization pass, just move the ui->data initialization after initializing security. Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 551148de66cd..848988f2b7dc 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1102,16 +1102,18 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) { + kfree(dev); + goto out_inode; + } + init_special_inode(inode, inode->i_mode, rdev); inode->i_size = ubifs_inode(inode)->ui_size = devlen; ui = ubifs_inode(inode); ui->data = dev; ui->data_len = devlen; - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; - mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; @@ -1184,6 +1186,10 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out_fname; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_inode; + ui = ubifs_inode(inode); ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { @@ -1209,10 +1215,6 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; - err = ubifs_init_security(dir, inode, &dentry->d_name); - if (err) - goto out_inode; - mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; From b25e6a5f78e84598ec62552ef342a58a23b6f7c4 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:49 +0800 Subject: [PATCH 066/120] ubifs: Fix space leak when powercut happens in linking tmpfile There is a potential space leak problem when powercut happens in linking tmpfile, in which case, inode node (with nlink=0) and its' data nodes can be found from tnc (on flash), but there are no dentries related to the inode, so the file is invisible but takes free space. Detailed process is shown as: ubifs_tmpfile ubifs_jnl_update // Add bud A into log area ubifs_add_orphan // Add inode into orphan list P1 P2 ubifs_link ubifs_delete_orphan // Delete inode from orphan list, then inode won't // be written into orphan area, there is no chance // to delete inode by replaying orphan. commit // bud A won't be replayed in next mounting >> powercut << ubifs_jnl_update // Link inode to dentry The root cause is that orphan entry deletion and journal writing(for link) are interrupted by commit, which makes the two operations are not atomic. Fix it by doing ubifs_delete_orphan under the protection of c->commit_sem within ubifs_jnl_update. This is also a preparation to support all creating new files by orphan inode. v1 is https://lore.kernel.org/linux-mtd/20200701093227.674945-1-chengzhihao1@huawei.com/ Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()") Link: https://bugzilla.kernel.org/show_bug.cgi?id=208405 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 22 ++++++++-------------- fs/ubifs/journal.c | 6 +++++- fs/ubifs/ubifs.h | 2 +- fs/ubifs/xattr.c | 2 +- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 848988f2b7dc..fe16443243ab 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -325,7 +325,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -479,7 +479,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&ui->ui_mutex); lock_2_inodes(dir, inode); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -760,10 +760,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, lock_2_inodes(dir, inode); - /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */ - if (inode->i_nlink == 0) - ubifs_delete_orphan(c, inode->i_ino); - inc_nlink(inode); ihold(inode); inode_set_ctime_current(inode); @@ -771,7 +767,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, inode->i_nlink == 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -785,8 +781,6 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); - if (inode->i_nlink == 0) - ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -846,7 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -950,7 +944,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -1025,7 +1019,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -1119,7 +1113,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1220,7 +1214,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 74aee92433d7..2b4b05c2d9b2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -643,6 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) * @inode: inode to update * @deletion: indicates a directory entry deletion i.e unlink or rmdir * @xent: non-zero if the directory entry is an extended attribute entry + * @delete_orphan: indicates an orphan entry deletion for @inode * * This function updates an inode by writing a directory entry (or extended * attribute entry), the inode itself, and the parent directory inode (or the @@ -664,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent) + int deletion, int xent, int delete_orphan) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); @@ -806,6 +807,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_ro; + if (delete_orphan) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); spin_lock(&ui->ui_lock); ui->synced_i_size = ui->ui_size; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 821c6ae99e62..d0ed78345617 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent); + int deletion, int xent, int delete_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 0847db521984..f734588b224a 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -149,7 +149,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) host_ui->flags |= UBIFS_CRYPT_FL; - err = ubifs_jnl_update(c, host, nm, inode, 0, 1); + err = ubifs_jnl_update(c, host, nm, inode, 0, 1, 0); if (err) goto out_cancel; ubifs_set_inode_flags(host); From 3af2d3a8c56fe7dc24f60c4df0ab85b7ac941902 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:50 +0800 Subject: [PATCH 067/120] ubifs: Fix unattached inode when powercut happens in creating For selinux or encryption scenarios, UBIFS could become inconsistent while creating new files in powercut case. Encryption/selinux related xattrs will be created before creating file dentry, which makes creation process is not atomic, details are shown as: Encryption case: ubifs_create ubifs_new_inode fscrypt_set_context ubifs_xattr_set create_xattr ubifs_jnl_update // Disk: xentry xinode inode(LAST_OF_NODE_GROUP) >> power cut << ubifs_jnl_update // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP) Selinux case: ubifs_create ubifs_new_inode ubifs_init_security security_inode_init_security ubifs_xattr_set create_xattr ubifs_jnl_update // Disk: xentry xinode inode(LAST_OF_NODE_GROUP) >> power cut << ubifs_jnl_update // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP) Above process will make chk_fs failed in next mounting: UBIFS error (ubi0:0 pid 7995): dbg_check_filesystem [ubifs]: inode 66 nlink is 1, but calculated nlink is 0 Fix it by allocating orphan inode for each non-xattr file creation, then removing orphan list in journal writing process, which ensures that both xattr and dentry be effective in atomic when powercut happens. Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS") Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218309 Suggested-by: Zhang Yi Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 59 +++++++++++++++++++++++++++++++--------------- fs/ubifs/journal.c | 14 +++++++---- fs/ubifs/ubifs.h | 4 ++-- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index fe16443243ab..c77ea57fe696 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -71,8 +71,13 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * @is_xattr: whether the inode is xattr inode * * This function finds an unused inode number, allocates new inode and - * initializes it. Returns new inode in case of success and an error code in - * case of failure. + * initializes it. Non-xattr new inode may be written with xattrs(selinux/ + * encryption) before writing dentry, which could cause inconsistent problem + * when powercut happens between two operations. To deal with it, non-xattr + * new inode is initialized with zero-nlink and added into orphan list, caller + * should make sure that inode is relinked later, and make sure that orphan + * removing and journal writing into an committing atomic operation. Returns + * new inode in case of success and an error code in case of failure. */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode, bool is_xattr) @@ -163,9 +168,25 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + if (!is_xattr) { + set_nlink(inode, 0); + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + ubifs_err(c, "ubifs_add_orphan failed: %i", err); + goto out_iput; + } + down_read(&c->commit_sem); + ui->del_cmtno = c->cmt_no; + up_read(&c->commit_sem); + } + if (encrypted) { err = fscrypt_set_context(inode, NULL); if (err) { + if (!is_xattr) { + set_nlink(inode, 1); + ubifs_delete_orphan(c, inode->i_ino); + } ubifs_err(c, "fscrypt_set_context failed: %i", err); goto out_iput; } @@ -320,12 +341,13 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -340,8 +362,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -386,7 +408,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) return inode; out_inode: - make_bad_inode(inode); iput(inode); out_free: ubifs_err(c, "cannot create whiteout file, error %d", err); @@ -470,6 +491,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&ui->ui_mutex); insert_inode_hash(inode); d_tmpfile(file, inode); @@ -479,7 +501,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&ui->ui_mutex); lock_2_inodes(dir, inode); - err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 1); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -492,7 +514,6 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, out_cancel: unlock_2_inodes(dir, inode); out_inode: - make_bad_inode(inode); if (!instantiated) iput(inode); out_budg: @@ -1011,6 +1032,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_inode; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); insert_inode_hash(inode); inc_nlink(inode); @@ -1019,7 +1041,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -1036,8 +1058,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; drop_nlink(dir); mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1107,13 +1129,14 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, ui = ubifs_inode(inode); ui->data = dev; ui->data_len = devlen; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1128,8 +1151,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1208,13 +1231,14 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, */ ui->data_len = disk_link.len - 1; inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; + set_nlink(inode, 1); mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); - err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1228,10 +1252,10 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); + set_nlink(inode, 0); out_inode: /* Free inode->i_link before inode is marked as bad. */ fscrypt_free_inode(inode); - make_bad_inode(inode); iput(inode); out_fname: fscrypt_free_filename(&nm); @@ -1399,14 +1423,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, */ err = ubifs_budget_space(c, &wht_req); if (err) { - /* - * Whiteout inode can not be written on flash by - * ubifs_jnl_write_inode(), because it's neither - * dirty nor zero-nlink. - */ iput(whiteout); goto out_release; } + set_nlink(whiteout, 1); /* Add the old_dentry size to the old_dir size. */ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm)); @@ -1485,7 +1505,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, } err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, - new_inode, &new_nm, whiteout, sync); + new_inode, &new_nm, whiteout, sync, !!whiteout); if (err) goto out_cancel; @@ -1538,6 +1558,7 @@ out_cancel: unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); if (whiteout) { ubifs_release_budget(c, &wht_req); + set_nlink(whiteout, 0); iput(whiteout); } out_release: diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 2b4b05c2d9b2..517f5de76b49 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -643,7 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) * @inode: inode to update * @deletion: indicates a directory entry deletion i.e unlink or rmdir * @xent: non-zero if the directory entry is an extended attribute entry - * @delete_orphan: indicates an orphan entry deletion for @inode + * @in_orphan: indicates whether the @inode is in orphan list * * This function updates an inode by writing a directory entry (or extended * attribute entry), the inode itself, and the parent directory inode (or the @@ -665,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent, int delete_orphan) + int deletion, int xent, int in_orphan) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0; int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); @@ -751,7 +751,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_release; - if (last_reference) { + if (last_reference && !in_orphan) { err = ubifs_add_orphan(c, inode->i_ino); if (err) { release_head(c, BASEHD); @@ -807,7 +807,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, if (err) goto out_ro; - if (delete_orphan) + if (in_orphan && inode->i_nlink) ubifs_delete_orphan(c, inode->i_ino); finish_reservation(c); @@ -1340,6 +1340,7 @@ out_free: * @new_nm: new name of the new directory entry * @whiteout: whiteout inode * @sync: non-zero if the write-buffer has to be synchronized + * @delete_orphan: indicates an orphan entry deletion for @whiteout * * This function implements the re-name operation which may involve writing up * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes) @@ -1352,7 +1353,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync) + const struct inode *whiteout, int sync, int delete_orphan) { void *p; union ubifs_key key; @@ -1569,6 +1570,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, goto out_ro; } + if (delete_orphan) + ubifs_delete_orphan(c, whiteout->i_ino); + finish_reservation(c); if (new_inode) { mark_inode_clean(c, new_ui); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d0ed78345617..b2c6554fa857 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, - int deletion, int xent, int delete_orphan); + int deletion, int xent, int in_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); @@ -1817,7 +1817,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, const struct inode *new_dir, const struct inode *new_inode, const struct fscrypt_name *new_nm, - const struct inode *whiteout, int sync); + const struct inode *whiteout, int sync, int delete_orphan); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, From 06776df740660e27c351d89f314d05b2720ba41f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 10 Apr 2024 15:37:51 +0800 Subject: [PATCH 068/120] ubifs: dbg_orphan_check: Fix missed key type checking When selinux/encryption is enabled, xattr entry node is added into TNC before host inode when creating new file. So it is possible to find xattr entry without host inode from TNC. Orphan debug checking is called by ubifs_orphan_end_commit(), at that time, the commit semaphore is already unlock, so the new creation won't be blocked. Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS") Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto") Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/orphan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 37d206097112..fb957d963ba6 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -816,8 +816,12 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, inum = key_inum(c, &zbr->key); if (inum != ci->last_ino) { - /* Lowest node type is the inode node, so it comes first */ - if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + /* + * Lowest node type is the inode node or xattr entry(when + * selinux/encryption is enabled), so it comes first + */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY && + key_type(c, &zbr->key) != UBIFS_XENT_KEY) ubifs_err(c, "found orphan node ino %lu, type %d", (unsigned long)inum, key_type(c, &zbr->key)); ci->last_ino = inum; From 72f3d3daddd740f744a24cd7ef8c27bd0cd5489d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 11 Apr 2024 00:42:41 +0200 Subject: [PATCH 069/120] mtd: ubi: Restore missing cleanup on ubi_init() failure path We need to clean-up debugfs and ubiblock if we fail after initialising them. Signed-off-by: Ben Hutchings Fixes: 927c145208b0 ("mtd: ubi: attach from device tree") Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 952c80269f57..30be4ed68fad 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1372,7 +1372,7 @@ static int __init ubi_init(void) /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) - goto out_slab; + goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); @@ -1387,6 +1387,9 @@ static int __init ubi_init(void) out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); + ubiblock_exit(); +out_debugfs: + ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: From 7037c96d8c42965eb93e6b96cf921399c283d431 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Thu, 18 Apr 2024 15:07:04 +0800 Subject: [PATCH 070/120] ubifs: correct UBIFS_DFS_DIR_LEN macro definition and improve code clarity The UBIFS_DFS_DIR_LEN macro, which defines the maximum length of the UBIFS debugfs directory name, has an incorrect formula and misleading comments. The current formula is (3 + 1 + 2*2 + 1), which assumes that both UBI device number and volume ID are limited to 2 characters. However, UBI device number ranges from 0 to 31 (2 characters), and volume ID ranges from 0 to 127 (up to 3 characters). Although the current code works due to the cancellation of mathematical errors (9 + 1 = 10, which matches the correct UBIFS_DFS_DIR_LEN value), it can lead to confusion and potential issues in the future. This patch aims to improve the code clarity and maintainability by making the following changes: 1. Corrects the UBIFS_DFS_DIR_LEN macro definition to (3 + 1 + 2 + 3 + 1), accommodating the maximum lengths of both UBI device number and volume ID, plus the separators and null terminator. 2. Updates the snprintf calls to use UBIFS_DFS_DIR_LEN instead of UBIFS_DFS_DIR_LEN + 1, removing the unnecessary +1. 3. Modifies the error checks to compare against UBIFS_DFS_DIR_LEN using >= instead of >, aligning with the corrected macro definition. 4. Removes the redundant +1 in the dfs_dir_name array definitions in ubi.h and debug.h. While these changes do not affect the runtime behavior, they make the code more readable, maintainable, and less prone to future errors. v2->v3: - Removes the duplicated UBIFS_DFS_DIR_LEN and UBIFS_DFS_DIR_NAME macro definitions in ubifs.h, as they are already defined in debug.h. Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/debug.c | 4 ++-- drivers/mtd/ubi/ubi.h | 2 +- fs/ubifs/debug.c | 4 ++-- fs/ubifs/debug.h | 7 ++++--- fs/ubifs/sysfs.c | 6 +++--- fs/ubifs/ubifs.h | 7 ------- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c index d57f52bd2ff3..9ec3b8b6a0aa 100644 --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@ -598,9 +598,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi) if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN, UBI_DFS_DIR_NAME, ubi->ubi_num); - if (n > UBI_DFS_DIR_LEN) { + if (n >= UBI_DFS_DIR_LEN) { /* The array size is too small */ return -EINVAL; } diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 1aead55090d0..1c9e874e8ede 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -420,7 +420,7 @@ struct ubi_debug_info { unsigned int power_cut_min; unsigned int power_cut_max; unsigned int emulate_failures; - char dfs_dir_name[UBI_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBI_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_chk_gen; struct dentry *dfs_chk_io; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index ac77ac1fd73e..d91cec93d968 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2827,9 +2827,9 @@ void dbg_debugfs_init_fs(struct ubifs_info *c) const char *fname; struct ubifs_debug_info *d = c->dbg; - n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ return; } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index ed966108da80..d425861e6b82 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -19,10 +19,11 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, /* * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. + * + 1 for "_" and 2 for UBI device numbers and 3 for volume number and 1 for + * the trailing zero byte. */ #define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) +#define UBIFS_DFS_DIR_LEN (3 + 1 + 2 + 3 + 1) /** * ubifs_debug_info - per-FS debugging information. @@ -103,7 +104,7 @@ struct ubifs_debug_info { unsigned int chk_fs:1; unsigned int tst_rcvry:1; - char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; struct dentry *dfs_dir; struct dentry *dfs_dump_lprops; struct dentry *dfs_dump_budg; diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c index 1c958148bb87..aae32222f11b 100644 --- a/fs/ubifs/sysfs.c +++ b/fs/ubifs/sysfs.c @@ -91,17 +91,17 @@ static struct kset ubifs_kset = { int ubifs_sysfs_register(struct ubifs_info *c) { int ret, n; - char dfs_dir_name[UBIFS_DFS_DIR_LEN+1]; + char dfs_dir_name[UBIFS_DFS_DIR_LEN]; c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL); if (!c->stats) { ret = -ENOMEM; goto out_last; } - n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, + n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); - if (n > UBIFS_DFS_DIR_LEN) { + if (n >= UBIFS_DFS_DIR_LEN) { /* The array size is too small */ ret = -EINVAL; goto out_free; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index b2c6554fa857..d69a5a42d693 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -157,13 +157,6 @@ #define UBIFS_HMAC_ARR_SZ 0 #endif -/* - * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi" - * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte. - */ -#define UBIFS_DFS_DIR_NAME "ubi%d_%d" -#define UBIFS_DFS_DIR_LEN (3 + 1 + 2*2 + 1) - /* * Lockdep classes for UBIFS inode @ui_mutex. */ From 39986148bc2ab4436ec169c8f4db76f6cc83705d Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 2 May 2024 13:16:55 -0700 Subject: [PATCH 071/120] ubifs: fix kernel-doc warnings make C=1 reports the following kernel-doc warnings: fs/ubifs/compress.c:103: warning: Function parameter or struct member 'c' not described in 'ubifs_compress' fs/ubifs/compress.c:155: warning: Function parameter or struct member 'c' not described in 'ubifs_decompress' fs/ubifs/find.c:353: warning: Excess function parameter 'data' description in 'scan_for_free_cb' fs/ubifs/find.c:353: warning: Function parameter or struct member 'arg' not described in 'scan_for_free_cb' fs/ubifs/find.c:594: warning: Excess function parameter 'data' description in 'scan_for_idx_cb' fs/ubifs/find.c:594: warning: Function parameter or struct member 'arg' not described in 'scan_for_idx_cb' fs/ubifs/find.c:786: warning: Excess function parameter 'data' description in 'scan_dirty_idx_cb' fs/ubifs/find.c:786: warning: Function parameter or struct member 'arg' not described in 'scan_dirty_idx_cb' fs/ubifs/find.c:86: warning: Excess function parameter 'data' description in 'scan_for_dirty_cb' fs/ubifs/find.c:86: warning: Function parameter or struct member 'arg' not described in 'scan_for_dirty_cb' fs/ubifs/journal.c:369: warning: expecting prototype for wake_up_reservation(). Prototype was for add_or_start_queue() instead fs/ubifs/lprops.c:1018: warning: Excess function parameter 'lst' description in 'scan_check_cb' fs/ubifs/lprops.c:1018: warning: Function parameter or struct member 'arg' not described in 'scan_check_cb' fs/ubifs/lpt.c:1938: warning: Function parameter or struct member 'ptr' not described in 'lpt_scan_node' fs/ubifs/replay.c:60: warning: Function parameter or struct member 'hash' not described in 'replay_entry' Fix them. Signed-off-by: Jeff Johnson Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/compress.c | 2 ++ fs/ubifs/find.c | 8 ++++---- fs/ubifs/journal.c | 2 +- fs/ubifs/lprops.c | 2 +- fs/ubifs/lpt.c | 1 + fs/ubifs/replay.c | 1 + 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 75461777c466..0b48cbab8a3d 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -82,6 +82,7 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /** * ubifs_compress - compress data. + * @c: UBIFS file-system description object * @in_buf: data to compress * @in_len: length of the data to compress * @out_buf: output buffer where compressed data should be stored @@ -140,6 +141,7 @@ no_compr: /** * ubifs_decompress - decompress data. + * @c: UBIFS file-system description object * @in_buf: data to decompress * @in_len: length of the data to decompress * @out_buf: output buffer where decompressed data should diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 6ebf3c04ac5f..643718906b9f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -73,7 +73,7 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -340,7 +340,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -581,7 +581,7 @@ out: * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -773,7 +773,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) * @c: the UBIFS file-system description object * @lprops: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @arg: information passed to and from the caller of the scan * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 517f5de76b49..4a35f9e8f668 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -359,7 +359,7 @@ static void wake_up_reservation(struct ubifs_info *c) } /** - * wake_up_reservation - add current task in queue or start queuing. + * add_or_start_queue - add current task in queue or start queuing. * @c: UBIFS file-system description object * * This function starts queuing if queuing is not started, otherwise adds diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index a11c3dab7e16..8788740ec57f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1005,7 +1005,7 @@ out: * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @lst: lprops statistics to update + * @arg: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 778a22bf9a92..441d0beca4cf 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1918,6 +1918,7 @@ out_err: * @pnode: where to keep a pnode * @cnode: where to keep a cnode * @in_tree: is the node in the tree in memory + * @ptr: union of node pointers * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in * the tree * @ptr.pnode: ditto for pnode diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 17da28d6247a..a950c5f2560e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -29,6 +29,7 @@ * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @hash: node hash * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number * @list: links the replay list From 4f9d406c8c90dc17470cf63342c16f66ec2d978e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 23 May 2024 01:10:35 +0800 Subject: [PATCH 072/120] ubi: block: fix null-pointer-dereference in ubiblock_create() Similar to commit adbf4c4954e3 ("ubi: block: fix memleak in ubiblock_create()"), 'dev->gd' is not assigned but dereferenced if blk_mq_alloc_tag_set() fails, and leading to a null-pointer-dereference. Fix it by using pr_err() and variable 'dev' to print error log. Additionally, the log in the error handle path of idr_alloc() has been improved by using pr_err(), too. Before initializing device name, using dev_err() will print error log with 'null' instead of the actual device name, like this: block (null): ... ~~~~~~ It is unclear. Using pr_err() can print more details of the device. The improved log is: ubiblock0_0: ... Fixes: 77567b25ab9f ("ubi: use blk_mq_alloc_disk and blk_cleanup_disk") Reported-by: Dan Carpenter Signed-off-by: Li Nan Reviewed-by: Zhihao Cheng Reviewed-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index f82e3423acb9..bf7308e8ec2f 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -390,7 +390,8 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = blk_mq_alloc_tag_set(&dev->tag_set); if (ret) { - dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed"); + pr_err("ubiblock%d_%d: blk_mq_alloc_tag_set failed\n", + dev->ubi_num, dev->vol_id); goto out_free_dev; } @@ -407,8 +408,8 @@ int ubiblock_create(struct ubi_volume_info *vi) gd->minors = 1; gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); if (gd->first_minor < 0) { - dev_err(disk_to_dev(gd), - "block: dynamic minor allocation failed"); + pr_err("ubiblock%d_%d: block: dynamic minor allocation failed\n", + dev->ubi_num, dev->vol_id); ret = -ENODEV; goto out_cleanup_disk; } From 25e79a7f2c9e54872fb2c8932bb582a500284505 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 4 Jun 2024 19:32:07 +0800 Subject: [PATCH 073/120] ubifs: Fix inconsistent inode size when powercut happens during appendant writing UBIFS always make sure that the data length won't beyond the inode size by writing inode before writing page(See ubifs_writepage.). After commit c35acef383f4a2f2cfc30("ubifs: Convert ubifs_writepage to use a folio"), the rule is broken in one case: Given a file with size 3, then write 4096 from the offset 0, following process will make inode size be smaller than file data length after powercut & recovery: P1 P2 ubifs_writepage len = folio_size(folio) // 4096 if (folio_pos(folio) + len <= i_size) // condition 1: 0 + 4096 <= 4096 //(i_size is updated as 4096 in ubifs_write_end) if (folio_pos(folio) >= synced_i_size) // condition 2: 0 >= 3, false write_inode // Skipped, because condition 2 is false do_writepage(folio, len) // write one page do_commit // data node won't be replayed in next mounting >> Powercut << So, inode size(4096) is not updated into disk, we will get following error messages in next mounting(chk_fs = 1): check_leaf [ubifs]: data node at LEB 14:2048 is not within inode size 3 dbg_walk_index [ubifs]: leaf checking function returned error -22, for leaf at LEB 14:2048 Fix it by modifying condition 2 as original comparison(Compare the page index of synced_i_size with current page index). Fixes: c35acef383f4 ("ubifs: Convert ubifs_writepage to use a folio") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218934 Signed-off-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a1f46919934c..68e104423a48 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1027,7 +1027,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, /* Is the folio fully inside i_size? */ if (folio_pos(folio) + len <= i_size) { - if (folio_pos(folio) >= synced_i_size) { + if (folio_pos(folio) + len > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; From 054fd15984454f031611d6c63675fc578aad0cb1 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Thu, 20 Jun 2024 16:19:26 +0800 Subject: [PATCH 074/120] ubifs: add check for crypto_shash_tfm_digest Add check for the return value of crypto_shash_tfm_digest() and return the error if it fails in order to catch the error. Fixes: 817aa094842d ("ubifs: support offline signed images") Signed-off-by: Chen Ni Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/master.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 7adc37c10b6a..a148760fa49e 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -67,10 +67,13 @@ static int mst_node_check_hash(const struct ubifs_info *c, { u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; + int ret; - crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + ret = crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + if (ret) + return ret; if (ubifs_check_hash(c, expected, calc)) return -EPERM; From ab091ec536cb7b271983c0c063b17f62f3591583 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Mon, 15 Jul 2024 17:31:44 +0800 Subject: [PATCH 075/120] nvme/pci: Add APST quirk for Lenovo N60z laptop There is a hardware power-saving problem with the Lenovo N60z board. When turn it on and leave it for 10 hours, there is a 20% chance that a nvme disk will not wake up until reboot. Link: https://lore.kernel.org/all/2B5581C46AC6E335+9c7a81f1-05fb-4fd0-9fbb-108757c21628@uniontech.com Signed-off-by: hmy Signed-off-by: Wentao Guan Signed-off-by: WangYuli Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 21f8e1b9801f..bc24a85e96ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2968,6 +2968,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND; } + /* + * NVMe SSD drops off the PCIe bus after system idle + * for 10 hours on a Lenovo N60z board. + */ + if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6")) + return NVME_QUIRK_NO_APST; + return 0; } From 1a7812b25e69f6c63c52d3bc93c0970ab7ad9660 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 13 Jul 2024 15:43:17 +0200 Subject: [PATCH 076/120] nvme-fabrics: Use seq_putc() in __nvmf_concat_opt_tokens() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single characters should be put into a sequence. Thus use the corresponding function “seq_putc”. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 44e342a46f39..f5f545fa0103 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -1403,10 +1403,10 @@ static void __nvmf_concat_opt_tokens(struct seq_file *seq_file) tok = &opt_tokens[idx]; if (tok->token == NVMF_OPT_ERR) continue; - seq_puts(seq_file, ","); + seq_putc(seq_file, ','); seq_puts(seq_file, tok->pattern); } - seq_puts(seq_file, "\n"); + seq_putc(seq_file, '\n'); } static int nvmf_dev_show(struct seq_file *seq_file, void *private) From 88c918d1ee2c88cc5be3aa67ce048414fee471ff Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 11 Jul 2024 14:40:53 +0300 Subject: [PATCH 077/120] nvme: remove redundant bdev local variable Use disk directly instead of getting it from bdev->bd_disk. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3c55f7edd181..ba05faaac562 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -233,13 +233,12 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns_head *head = dev_to_ns_head(dev); struct gendisk *disk = dev_to_disk(dev); - struct block_device *bdev = disk->part0; int ret; - if (nvme_disk_is_ns_head(bdev->bd_disk)) + if (nvme_disk_is_ns_head(disk)) ret = ns_head_update_nuse(head); else - ret = ns_update_nuse(bdev->bd_disk->private_data); + ret = ns_update_nuse(disk->private_data); if (ret) return ret; From 92fc2c469eb26060384e9b2cd4cb0cc228aba582 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 11 Jul 2024 15:59:52 -0700 Subject: [PATCH 078/120] nvme-pci: Fix the instructions for disabling power management pcie_aspm=off tells the kernel not to modify the ASPM configuration. This setting does not guarantee that ASPM (Active State Power Management) is disabled. Hence add pcie_port_pm=off. This disables power management for all PCIe ports. This patch has been tested on a workstation with a Samsung SSD 970 EVO Plus NVMe SSD. Fixes: 4641a8e6e145 ("nvme-pci: add trouble shooting steps for timeouts") Cc: Keith Busch Cc: Christoph Hellwig Cc: Chaitanya Kulkarni Signed-off-by: Bart Van Assche Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bc24a85e96ce..8087b054da66 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1309,7 +1309,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) dev_warn(dev->ctrl.device, "Does your device have a faulty power saving mode enabled?\n"); dev_warn(dev->ctrl.device, - "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n"); + "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n"); } static enum blk_eh_timer_return nvme_timeout(struct request *req) From 415fb383ec2bbe4d4bb1a1b5593cd67eb058f1de Mon Sep 17 00:00:00 2001 From: Francis Pravin Date: Mon, 15 Jul 2024 06:31:57 +0530 Subject: [PATCH 079/120] nvme-core: choose PIF from QPIF if QPIFS supports and PIF is QTYPE As per TP4141a: "If the Qualified Protection Information Format Support(QPIFS) bit is set to 1 and the Protection Information Format(PIF) field is set to 11b (i.e., Qualified Type), then the pif is as defined in the Qualified Protection Information Format (QPIF) field." So, choose PIF from QPIF if QPIFS supports and PIF is QTYPE. Signed-off-by: Francis Pravin Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- include/linux/nvme.h | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 448b8239bc99..e78ef31eeef0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,12 +1875,18 @@ static void nvme_configure_pi_elbas(struct nvme_ns_head *head, struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) { u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); + u8 guard_type; /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) return; - head->guard_type = nvme_elbaf_guard_type(elbaf); + guard_type = nvme_elbaf_guard_type(elbaf); + if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) && + guard_type == NVME_NVM_NS_QTYPE_GUARD) + guard_type = nvme_elbaf_qualified_guard_type(elbaf); + + head->guard_type = guard_type; switch (head->guard_type) { case NVME_NVM_NS_64B_GUARD: head->pi_size = sizeof(struct crc64_pi_tuple); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 57e27e48c913..5c2290f0d5af 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -483,6 +483,9 @@ enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, NVME_ID_NS_NVM_GUARD_MASK = 0x3, + NVME_ID_NS_NVM_QPIF_SHIFT = 9, + NVME_ID_NS_NVM_QPIF_MASK = 0xf, + NVME_ID_NS_NVM_QPIFS = 1 << 3, }; static inline __u8 nvme_elbaf_sts(__u32 elbaf) @@ -495,6 +498,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf) return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK; } +static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf) +{ + return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK; +} + struct nvme_id_ctrl_nvm { __u8 vsl; __u8 wzsl; @@ -574,6 +582,7 @@ enum { NVME_NVM_NS_16B_GUARD = 0, NVME_NVM_NS_32B_GUARD = 1, NVME_NVM_NS_64B_GUARD = 2, + NVME_NVM_NS_QTYPE_GUARD = 3, }; static inline __u8 nvme_lbaf_index(__u8 flbas) From a0328b397f3339d8d17a6ec356e94b3c110b010c Mon Sep 17 00:00:00 2001 From: Foryun Ma Date: Tue, 4 Jun 2024 11:21:51 +0800 Subject: [PATCH 080/120] cxl/core/pci: Move reading of control register to immediately before usage Relocate the reading of the DVSEC control register to immediately before usage and avoid unnecessary PCI config access from the read if DVSEC capability check, hdm_count check, or device validity check results in failure. Signed-off-by: Foryun Ma Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20240604032151.655-1-foryun.ma@jaguarmicro.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 8567dd11eaac..a663e7566c48 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -338,10 +338,6 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, if (rc) return rc; - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); - if (rc) - return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; @@ -368,6 +364,10 @@ int cxl_dvsec_rr_decode(struct device *dev, int d, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. */ + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; From b8a4518b5c2d1164f9bb2e586733a658c5239adf Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 23 Jul 2024 10:41:52 +0100 Subject: [PATCH 081/120] drbd: Add peer_device to Kernel doc Add missing documentation of peer_device parameter to Kernel doc. These parameters were added in commit 8164dd6c8ae1 ("drbd: Add peer device parameter to whole-bitmap I/O handlers") Flagged by W=1 builds. Signed-off-by: Simon Horman Link: https://lore.kernel.org/r/20240723-drbd-doc-v1-1-a04d9b7a9688@kernel.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f92673f05c7a..a9e49b212341 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3422,6 +3422,7 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) /** * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Sets all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3448,6 +3449,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device, /** * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() * @device: DRBD device. + * @peer_device: Peer DRBD device. * * Clears all bits in the bitmap and writes the whole bitmap to stable storage. */ @@ -3501,6 +3503,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused) * @done: callback to be called after the bitmap IO was performed * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * While IO on the bitmap happens we freeze application IO thus we ensure * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be @@ -3549,6 +3552,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device, * @io_fn: IO callback to be called when bitmap IO is possible * @why: Descriptive text of the reason for doing the IO * @flags: Bitmap flags + * @peer_device: Peer DRBD device. * * freezes application IO while that the actual IO operations runs. This * functions MAY NOT be called from worker context. From 193cc89ea0ca1da311877d2b4bb5e9f03bcc82a2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 21 Jul 2024 15:45:56 -0500 Subject: [PATCH 082/120] cifs: fix potential null pointer use in destroy_workqueue in init_cifs error path Dan Carpenter reported a Smack static checker warning: fs/smb/client/cifsfs.c:1981 init_cifs() error: we previously assumed 'serverclose_wq' could be null (see line 1895) The patch which introduced the serverclose workqueue used the wrong oredering in error paths in init_cifs() for freeing it on errors. Fixes: 173217bd7336 ("smb3: retrying on failed server close") Cc: stable@vger.kernel.org Cc: Ritvik Budhiraja Reported-by: Dan Carpenter Reviewed-by: Dan Carpenter Reviewed-by: David Howells Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index c92937bed133..2c4b357d85e2 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1894,12 +1894,12 @@ init_cifs(void) WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!serverclose_wq) { rc = -ENOMEM; - goto out_destroy_serverclose_wq; + goto out_destroy_deferredclose_wq; } rc = cifs_init_inodecache(); if (rc) - goto out_destroy_deferredclose_wq; + goto out_destroy_serverclose_wq; rc = cifs_init_netfs(); if (rc) @@ -1967,6 +1967,8 @@ out_destroy_netfs: cifs_destroy_netfs(); out_destroy_inodecache: cifs_destroy_inodecache(); +out_destroy_serverclose_wq: + destroy_workqueue(serverclose_wq); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1977,8 +1979,6 @@ out_destroy_decrypt_wq: destroy_workqueue(decrypt_wq); out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); -out_destroy_serverclose_wq: - destroy_workqueue(serverclose_wq); out_clean_proc: cifs_proc_clean(); return rc; From a214384ce26b6111ea8c8d58fa82a1ca63996c38 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 22 Jul 2024 23:40:08 -0500 Subject: [PATCH 083/120] cifs: fix reconnect with SMB1 UNIX Extensions When mounting with the SMB1 Unix Extensions (e.g. mounts to Samba with vers=1.0), reconnects no longer reset the Unix Extensions (SetFSInfo SET_FILE_UNIX_BASIC) after tcon so most operations (e.g. stat, ls, open, statfs) will fail continuously with: "Operation not supported" if the connection ever resets (e.g. due to brief network disconnect) Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 7a16e12f5da8..89d9f86cc29a 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -3686,6 +3686,7 @@ error: } #endif +#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY /* * Issue a TREE_CONNECT request. */ @@ -3807,11 +3808,25 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, else tcon->Flags = 0; cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); - } + /* + * reset_cifs_unix_caps calls QFSInfo which requires + * need_reconnect to be false, but we would not need to call + * reset_caps if this were not a reconnect case so must check + * need_reconnect flag here. The caller will also clear + * need_reconnect when tcon was successful but needed to be + * cleared earlier in the case of unix extensions reconnect + */ + if (tcon->need_reconnect && tcon->unix_ext) { + cifs_dbg(FYI, "resetting caps for %s\n", tcon->tree_name); + tcon->need_reconnect = false; + reset_cifs_unix_caps(xid, tcon, NULL, NULL); + } + } cifs_buf_release(smb_buffer); return rc; } +#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ static void delayed_free(struct rcu_head *p) { From 0e314e452687ce0ec5874e42cdb993a34325d3d2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 23 Jul 2024 00:44:48 -0500 Subject: [PATCH 084/120] cifs: mount with "unix" mount option for SMB1 incorrectly handled Although by default we negotiate CIFS Unix Extensions for SMB1 mounts to Samba (and they work if the user does not specify "unix" or "posix" or "linux" on mount), and we do properly handle when a user turns them off with "nounix" mount parm. But with the changes to the mount API we broke cases where the user explicitly specifies the "unix" option (or equivalently "linux" or "posix") on mount with vers=1.0 to Samba or other servers which support the CIFS Unix Extensions. "mount error(95): Operation not supported" and logged: "CIFS: VFS: Check vers= mount option. SMB3.11 disabled but required for POSIX extensions" even though CIFS Unix Extensions are supported for vers=1.0 This patch fixes the case where the user specifies both "unix" (or equivalently "posix" or "linux") and "vers=1.0" on mount to a server which supports the CIFS Unix Extensions. Cc: stable@vger.kernel.org Reviewed-by: David Howells Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/connect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 89d9f86cc29a..d2307162a2de 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2614,6 +2614,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n"); rc = -EOPNOTSUPP; goto out_fail; + } else if (ses->server->vals->protocol_id == SMB10_PROT_ID) + if (cap_unix(ses)) + cifs_dbg(FYI, "Unix Extensions requested on SMB1 mount\n"); + else { + cifs_dbg(VFS, "SMB1 Unix Extensions not supported by server\n"); + rc = -EOPNOTSUPP; + goto out_fail; } else { cifs_dbg(VFS, "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n"); From ba6c664081afd18da86ac49cb22ceb266f89a561 Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Wed, 24 Jul 2024 10:46:55 +0200 Subject: [PATCH 085/120] kbuild: rpm-pkg: Fix C locale setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit semicolon separation in LC_ALL is wrong. Either variable needs to be exported before as a separate commit or set as part of the commit in the beginning. Used second variant. This fixes broken build on user's locale setup which makes 'date' binary to produce invalid characters in rpm changelog (e.g. cs_CZ.UTF-8 'čec'): $ make binrpm-pkg GEN rpmbuild/SPECS/kernel.spec rpmbuild -bb rpmbuild/SPECS/kernel.spec --define='_topdirlinux/rpmbuild' \ --target x86_64-linux --build-in-place --noprep --define='_smp_mflags \ %{nil}' $(rpm -q rpm >/dev/null 2>&1 || echo --nodeps) Building target platforms: x86_64-linux Building for target x86_64-linux error: bad date in %changelog: St čec 24 2024 user make[2]: *** [scripts/Makefile.package:71: binrpm-pkg] Error 1 make[1]: *** [linux/Makefile:1546: binrpm-pkg] Error 2 make: *** [Makefile:224: __sub-make] Error 2 Fixes: 301c10908e42 ("kbuild: rpm-pkg: introduce a simple changelog section for kernel.spec") Signed-off-by: Petr Vorel Reviewed-by: Miguel Ojeda Signed-off-by: Masahiro Yamada --- scripts/package/mkspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index ead54d67a024..4dc1466dfc81 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -50,6 +50,6 @@ fi cat << EOF %changelog -* $(LC_ALL=C; date +'%a %b %d %Y') ${name} <${email}> +* $(LC_ALL=C date +'%a %b %d %Y') ${name} <${email}> - Custom built Linux kernel. EOF From f8b632e89a101dae349a7b212c1771d7925f441b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:16 +0100 Subject: [PATCH 086/120] io_uring: tighten task exit cancellations io_uring_cancel_generic() should retry if any state changes like a request is completed, however in case of a task exit it only goes for another loop and avoids schedule() if any tracked (i.e. REQ_F_INFLIGHT) request got completed. Let's assume we have a non-tracked request executing in iowq and a tracked request linked to it. Let's also assume io_uring_cancel_generic() fails to find and cancel the request, i.e. via io_run_local_work(), which may happen as io-wq has gaps. Next, the request logically completes, io-wq still hold a ref but queues it for completion via tw, which happens in io_uring_try_cancel_requests(). After, right before prepare_to_wait() io-wq puts the request, grabs the linked one and tries executes it, e.g. arms polling. Finally the cancellation loop calls prepare_to_wait(), there are no tw to run, no tracked request was completed, so the tctx_inflight() check passes and the task is put to indefinite sleep. Cc: stable@vger.kernel.org Fixes: 3f48cf18f886c ("io_uring: unify files and task cancel") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/acac7311f4e02ce3c43293f8f1fda9c705d158f1.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8e6faa942a6f..10c409e56241 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3031,8 +3031,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) bool loop = false; io_uring_drop_tctx_refs(current); + if (!tctx_inflight(tctx, !cancel_all)) + break; + /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !cancel_all); + inflight = tctx_inflight(tctx, false); if (!inflight) break; From bd44d7e902c2b34c217d3b48874b079760ca7b6e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:17 +0100 Subject: [PATCH 087/120] io_uring: don't allow netpolling with SETUP_IOPOLL IORING_SETUP_IOPOLL rings don't have any netpoll handling, let's fail attempts to register netpolling in this case, there might be people who will mix up IOPOLL and netpoll. Cc: stable@vger.kernel.org Fixes: ef1186c1a875b ("io_uring: add register/unregister napi function") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e7553aee0a8ae4edec6742cd6dd0c1e6914fba8.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/napi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/napi.c b/io_uring/napi.c index 762254a7ff3f..327e5f3a8abe 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -222,6 +222,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) }; struct io_uring_napi napi; + if (ctx->flags & IORING_SETUP_IOPOLL) + return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) From e142e9cd8891b0c6f277ac2c2c254199a6aa56e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:18 +0100 Subject: [PATCH 088/120] io_uring: fix io_match_task must_hold The __must_hold annotation in io_match_task() uses a non existing parameter "req", fix it. Fixes: 6af3f48bf6156 ("io_uring: fix link traversal locking") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3e65ee7709e96507cef3d93291746f2c489f2307.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 1c9bf07499b1..9973876d91b0 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -639,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) - __must_hold(&req->ctx->timeout_lock) + __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; From f1dcdfcadb0c8c13dddd931c1f4dc58e54fdc9c0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:19 +0100 Subject: [PATCH 089/120] io_uring: simplify io_uring_cmd return We don't have to return error code from an op handler back to core io_uring, so once io_uring_cmd() sets the results and handles errors we can juts return IOU_OK and simplify the code. Note, only valid with e0b23d9953b0c ("io_uring: optimise ltimeout for inline execution"), there was a problem with iopoll before. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8eae2be5b2a49236cd5f1dadbd1aa5730e9e2d4f.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a54163a83968..8391c7c7c1ec 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return ret < 0 ? ret : IOU_OK; + return IOU_OK; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, From a2b72b81fb3ba18717fc000949ca9d45a3351130 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:20 +0100 Subject: [PATCH 090/120] io_uring: kill REQ_F_CANCEL_SEQ We removed the reliance on the flag by the cancellation path and now it's unused. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e57afe566bbe4fefeb44daffb08900f2a4756577.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3bb6198d1523..e62aa9f0629f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -461,7 +461,6 @@ enum { REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, REQ_F_POLL_NO_LAZY_BIT, - REQ_F_CANCEL_SEQ_BIT, REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, @@ -536,8 +535,6 @@ enum { REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT), /* don't use lazy poll wake for this request */ REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT), - /* cancel sequence is set and valid */ - REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT), /* file is pollable */ REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT), /* buffer list was empty after selection of buffer */ From 29d63b94036e561a016ec8878b44aad6650d23e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Jul 2024 12:16:21 +0100 Subject: [PATCH 091/120] io_uring: align iowq and task request error handling There is a difference in how io_queue_sqe and io_wq_submit_work treat error codes they get from io_issue_sqe. The first one fails anything unknown but latter only fails when the code is negative. It doesn't make sense to have this discrepancy, align them to the io_queue_sqe behaviour. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c550e152bf4a290187f91a4322ddcb5d6d1f2c73.1721819383.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 10c409e56241..2626424f5d73 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1849,7 +1849,7 @@ fail: } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret < 0) + if (ret) io_req_task_queue_fail(req, ret); } From 7e04da2dc7013af50ed3a2beb698d5168d1e594b Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 24 Jul 2024 15:04:12 +0800 Subject: [PATCH 092/120] block: fix deadlock between sd_remove & sd_release Our test report the following hung task: [ 2538.459400] INFO: task "kworker/0:0":7 blocked for more than 188 seconds. [ 2538.459427] Call trace: [ 2538.459430] __switch_to+0x174/0x338 [ 2538.459436] __schedule+0x628/0x9c4 [ 2538.459442] schedule+0x7c/0xe8 [ 2538.459447] schedule_preempt_disabled+0x24/0x40 [ 2538.459453] __mutex_lock+0x3ec/0xf04 [ 2538.459456] __mutex_lock_slowpath+0x14/0x24 [ 2538.459459] mutex_lock+0x30/0xd8 [ 2538.459462] del_gendisk+0xdc/0x350 [ 2538.459466] sd_remove+0x30/0x60 [ 2538.459470] device_release_driver_internal+0x1c4/0x2c4 [ 2538.459474] device_release_driver+0x18/0x28 [ 2538.459478] bus_remove_device+0x15c/0x174 [ 2538.459483] device_del+0x1d0/0x358 [ 2538.459488] __scsi_remove_device+0xa8/0x198 [ 2538.459493] scsi_forget_host+0x50/0x70 [ 2538.459497] scsi_remove_host+0x80/0x180 [ 2538.459502] usb_stor_disconnect+0x68/0xf4 [ 2538.459506] usb_unbind_interface+0xd4/0x280 [ 2538.459510] device_release_driver_internal+0x1c4/0x2c4 [ 2538.459514] device_release_driver+0x18/0x28 [ 2538.459518] bus_remove_device+0x15c/0x174 [ 2538.459523] device_del+0x1d0/0x358 [ 2538.459528] usb_disable_device+0x84/0x194 [ 2538.459532] usb_disconnect+0xec/0x300 [ 2538.459537] hub_event+0xb80/0x1870 [ 2538.459541] process_scheduled_works+0x248/0x4dc [ 2538.459545] worker_thread+0x244/0x334 [ 2538.459549] kthread+0x114/0x1bc [ 2538.461001] INFO: task "fsck.":15415 blocked for more than 188 seconds. [ 2538.461014] Call trace: [ 2538.461016] __switch_to+0x174/0x338 [ 2538.461021] __schedule+0x628/0x9c4 [ 2538.461025] schedule+0x7c/0xe8 [ 2538.461030] blk_queue_enter+0xc4/0x160 [ 2538.461034] blk_mq_alloc_request+0x120/0x1d4 [ 2538.461037] scsi_execute_cmd+0x7c/0x23c [ 2538.461040] ioctl_internal_command+0x5c/0x164 [ 2538.461046] scsi_set_medium_removal+0x5c/0xb0 [ 2538.461051] sd_release+0x50/0x94 [ 2538.461054] blkdev_put+0x190/0x28c [ 2538.461058] blkdev_release+0x28/0x40 [ 2538.461063] __fput+0xf8/0x2a8 [ 2538.461066] __fput_sync+0x28/0x5c [ 2538.461070] __arm64_sys_close+0x84/0xe8 [ 2538.461073] invoke_syscall+0x58/0x114 [ 2538.461078] el0_svc_common+0xac/0xe0 [ 2538.461082] do_el0_svc+0x1c/0x28 [ 2538.461087] el0_svc+0x38/0x68 [ 2538.461090] el0t_64_sync_handler+0x68/0xbc [ 2538.461093] el0t_64_sync+0x1a8/0x1ac T1: T2: sd_remove del_gendisk __blk_mark_disk_dead blk_freeze_queue_start ++q->mq_freeze_depth bdev_release mutex_lock(&disk->open_mutex) sd_release scsi_execute_cmd blk_queue_enter wait_event(!q->mq_freeze_depth) mutex_lock(&disk->open_mutex) SCSI does not set GD_OWNS_QUEUE, so QUEUE_FLAG_DYING is not set in this scenario. This is a classic ABBA deadlock. To fix the deadlock, make sure we don't try to acquire disk->open_mutex after freezing the queue. Cc: stable@vger.kernel.org Fixes: eec1be4c30df ("block: delete partitions later in del_gendisk") Signed-off-by: Yang Yang Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Fixes: and Cc: stable tags are missing. Otherwise this patch looks fine Link: https://lore.kernel.org/r/20240724070412.22521-1-yang.yang@vivo.com Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 4dc95a463505..1c05dd4c6980 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -663,12 +663,12 @@ void del_gendisk(struct gendisk *disk) */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); - __blk_mark_disk_dead(disk); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); + __blk_mark_disk_dead(disk); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); From 55fbb9a5d64e0e590cad5eacc16c99f2482a008f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2024 22:33:11 +0800 Subject: [PATCH 093/120] ublk: fix UBLK_CMD_DEL_DEV_ASYNC handling In ublk_ctrl_uring_cmd(), ioctl command NR should be used for matching _IOC_NR(cmd_op). Fix it by adding one private macro, and this way is clean. Fixes: 13fe8e6825e4 ("ublk: add UBLK_CMD_DEL_DEV_ASYNC") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240724143311.2646330-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6fb799ec0d88..890c08792ba8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -48,6 +48,9 @@ #define UBLK_MINORS (1U << MINORBITS) +/* private ioctl command mirror */ +#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -2903,7 +2906,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); break; - case UBLK_U_CMD_DEL_DEV_ASYNC: + case UBLK_CMD_DEL_DEV_ASYNC: ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: From 33be0cfa5ba522ba88ba25cb95e582932843409b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 28 Jun 2024 17:37:12 +0200 Subject: [PATCH 094/120] apparmor: take nosymfollow flag into account A "nosymfollow" flag was added in commit dab741e0e02b ("Add a "nosymfollow" mount option.") While we don't need to implement any special logic on the AppArmor kernel side to handle it, we should provide user with a correct list of mount flags in audit logs. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Georgia Garcia Signed-off-by: John Johansen --- security/apparmor/mount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c index 49fe8da6fea4..bf8863253e07 100644 --- a/security/apparmor/mount.c +++ b/security/apparmor/mount.c @@ -44,6 +44,8 @@ static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags) audit_log_format(ab, ", mand"); if (flags & MS_DIRSYNC) audit_log_format(ab, ", dirsync"); + if (flags & MS_NOSYMFOLLOW) + audit_log_format(ab, ", nosymfollow"); if (flags & MS_NOATIME) audit_log_format(ab, ", noatime"); if (flags & MS_NODIRATIME) From 4b954a025591a1c7d3a0c0111b6d4730596046b6 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Wed, 29 May 2024 18:21:39 -0700 Subject: [PATCH 095/120] apparmor: test: add MODULE_DESCRIPTION() Fix the 'make W=1' warning: WARNING: modpost: missing MODULE_DESCRIPTION() in security/apparmor/apparmor_policy_unpack_test.o Signed-off-by: Jeff Johnson Signed-off-by: John Johansen --- security/apparmor/policy_unpack_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c index 5c9bde25e56d..874fcf97794e 100644 --- a/security/apparmor/policy_unpack_test.c +++ b/security/apparmor/policy_unpack_test.c @@ -604,4 +604,5 @@ static struct kunit_suite apparmor_policy_unpack_test_module = { kunit_test_suite(apparmor_policy_unpack_test_module); +MODULE_DESCRIPTION("KUnit tests for AppArmor's policy unpack"); MODULE_LICENSE("GPL"); From f4fee216df7d28b87d1c9cc60bcebfecb51c1a05 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Jun 2024 19:15:27 +0200 Subject: [PATCH 096/120] apparmor: try to avoid refing the label in apparmor_file_open If the label is not stale (which is the common case), the fact that the passed file object holds a reference can be leverged to avoid the ref/unref cycle. Doing so reduces performance impact of apparmor on parallel open() invocations. When benchmarking on a 24-core vm using will-it-scale's open1_process ("Separate file open"), the results are (ops/s): before: 6092196 after: 8309726 (+36%) Signed-off-by: Mateusz Guzik Signed-off-by: John Johansen --- security/apparmor/include/cred.h | 20 ++++++++++++++++++++ security/apparmor/lsm.c | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h index 58fdc72af664..7265d2f81dd5 100644 --- a/security/apparmor/include/cred.h +++ b/security/apparmor/include/cred.h @@ -63,6 +63,26 @@ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred) return aa_get_newest_label(aa_cred_raw_label(cred)); } +static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred, + bool *needput) +{ + struct aa_label *l = aa_cred_raw_label(cred); + + if (unlikely(label_is_stale(l))) { + *needput = true; + return aa_get_newest_label(l); + } + + *needput = false; + return l; +} + +static inline void aa_put_label_condref(struct aa_label *l, bool needput) +{ + if (unlikely(needput)) + aa_put_label(l); +} + /** * aa_current_raw_label - find the current tasks confining label * diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 717204fba938..242d4cf857a7 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -461,6 +461,7 @@ static int apparmor_file_open(struct file *file) struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; + bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; @@ -477,7 +478,7 @@ static int apparmor_file_open(struct file *file) return 0; } - label = aa_get_newest_cred_label(file->f_cred); + label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); @@ -494,7 +495,7 @@ static int apparmor_file_open(struct file *file) /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } - aa_put_label(label); + aa_put_label_condref(label, needput); return error; } From e0ff0cff1f6cdce0aa596aac04129893201c4162 Mon Sep 17 00:00:00 2001 From: Georgia Garcia Date: Mon, 10 Jun 2024 09:51:48 -0300 Subject: [PATCH 097/120] apparmor: unpack transition table if dfa is not present Due to a bug in earlier userspaces, a transition table may be present even when the dfa is not. Commit 7572fea31e3e ("apparmor: convert fperm lookup to use accept as an index") made the verification check more rigourous regressing old userspaces with the bug. For compatibility reasons allow the orphaned transition table during unpack and discard. Fixes: 7572fea31e3e ("apparmor: convert fperm lookup to use accept as an index") Signed-off-by: Georgia Garcia Signed-off-by: John Johansen --- security/apparmor/policy_unpack.c | 42 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 75452acd0e35..5a570235427d 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -747,34 +747,42 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy, *info = "missing required dfa"; goto fail; } - goto out; + } else { + /* + * only unpack the following if a dfa is present + * + * sadly start was given different names for file and policydb + * but since it is optional we can try both + */ + if (!aa_unpack_u32(e, &pdb->start[0], "start")) + /* default start state */ + pdb->start[0] = DFA_START; + if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { + /* default start state for xmatch and file dfa */ + pdb->start[AA_CLASS_FILE] = DFA_START; + } /* setup class index */ + for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { + pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], + i); + } } /* - * only unpack the following if a dfa is present - * - * sadly start was given different names for file and policydb - * but since it is optional we can try both + * Unfortunately due to a bug in earlier userspaces, a + * transition table may be present even when the dfa is + * not. For compatibility reasons unpack and discard. */ - if (!aa_unpack_u32(e, &pdb->start[0], "start")) - /* default start state */ - pdb->start[0] = DFA_START; - if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) { - /* default start state for xmatch and file dfa */ - pdb->start[AA_CLASS_FILE] = DFA_START; - } /* setup class index */ - for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) { - pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0], - i); - } if (!unpack_trans_table(e, &pdb->trans) && required_trans) { *info = "failed to unpack profile transition table"; goto fail; } + if (!pdb->dfa && pdb->trans.table) + aa_free_str_table(&pdb->trans); + /* TODO: move compat mapping here, requires dfa merging first */ /* TODO: move verify here, it has to be done after compat mappings */ -out: + *policy = pdb; return 0; From c31fad1470389666ac7169fe43aa65bf5b7e2cfd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jul 2024 13:31:14 +0300 Subject: [PATCH 098/120] nvme-pci: add missing condition check for existence of mapped data nvme_map_data() is called when request has physical segments, hence the nvme_unmap_data() should have same condition to avoid dereference. Fixes: 4aedb705437f ("nvme-pci: split metadata handling from nvme_map_data / nvme_unmap_data") Signed-off-by: Leon Romanovsky Reviewed-by: Christoph Hellwig Reviewed-by: Nitesh Shetty Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8087b054da66..6cd9395ba9ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -863,7 +863,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) nvme_start_request(req); return BLK_STS_OK; out_unmap_data: - nvme_unmap_data(dev, req); + if (blk_rq_nr_phys_segments(req)) + nvme_unmap_data(dev, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; From 0db4618e8fabfcc404af4dda23799bba726785a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Jul 2024 08:41:35 -0600 Subject: [PATCH 099/120] io_uring/msg_ring: fix uninitialized use of target_req->flags syzbot reports that KMSAN complains that 'nr_tw' is an uninit-value with the following report: BUG: KMSAN: uninit-value in io_req_local_work_add io_uring/io_uring.c:1192 [inline] BUG: KMSAN: uninit-value in io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240 io_req_local_work_add io_uring/io_uring.c:1192 [inline] io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240 io_msg_remote_post io_uring/msg_ring.c:102 [inline] io_msg_data_remote io_uring/msg_ring.c:133 [inline] io_msg_ring_data io_uring/msg_ring.c:152 [inline] io_msg_ring+0x1c38/0x1ef0 io_uring/msg_ring.c:305 io_issue_sqe+0x383/0x22c0 io_uring/io_uring.c:1710 io_queue_sqe io_uring/io_uring.c:1924 [inline] io_submit_sqe io_uring/io_uring.c:2180 [inline] io_submit_sqes+0x1259/0x2f20 io_uring/io_uring.c:2295 __do_sys_io_uring_enter io_uring/io_uring.c:3205 [inline] __se_sys_io_uring_enter+0x40c/0x3ca0 io_uring/io_uring.c:3142 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3142 x64_sys_call+0x2d82/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:427 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is the following check: if (nr_tw < nr_wait) return; in io_req_local_work_add(). While nr_tw itself cannot be uninitialized, it does depend on req->flags, which off the msg ring issue path can indeed be uninitialized. Fix this by always clearing the allocated 'req' fully if we can't grab one from the cache itself. Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Reported-by: syzbot+82609b8937a4458106ca@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/000000000000fd3d8d061dfc0e4a@google.com/ Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 29fa9285a33d..7fd9badcfaf8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -110,10 +110,10 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) if (spin_trylock(&ctx->msg_lock)) { req = io_alloc_cache_get(&ctx->msg_cache); spin_unlock(&ctx->msg_lock); + if (req) + return req; } - if (req) - return req; - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN); + return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } static int io_msg_data_remote(struct io_kiocb *req) From 342b2e395d5f34c9f111a818556e617939f83a8c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Jul 2024 15:24:30 +0100 Subject: [PATCH 100/120] io_uring/napi: use ktime in busy polling It's more natural to use ktime/ns instead of keeping around usec, especially since we're comparing it against user provided timers, so convert napi busy poll internal handling to ktime. It's also nicer since the type (ktime_t vs unsigned long) now tells the unit of measure. Keep everything as ktime, which we convert to/from micro seconds for IORING_[UN]REGISTER_NAPI. The net/ busy polling works seems to work with usec, however it's not real usec as shift by 10 is used to get it from nsecs, see busy_loop_current_time(), so it's easy to get truncated nsec back and we get back better precision. Note, we can further improve it later by removing the truncation and maybe convincing net/ to use ktime/ns instead. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/95e7ec8d095069a3ed5d40a4bc6f8b586698bc7e.1722003776.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.h | 2 +- io_uring/napi.c | 48 +++++++++++++++++++--------------- io_uring/napi.h | 2 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e62aa9f0629f..3315005df117 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -404,7 +404,7 @@ struct io_ring_ctx { spinlock_t napi_lock; /* napi_list lock */ /* napi busy poll default timeout */ - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; bool napi_enabled; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e1ce908f0679..c2acf6180845 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -43,7 +43,7 @@ struct io_wait_queue { ktime_t timeout; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_busy_poll_to; + ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; diff --git a/io_uring/napi.c b/io_uring/napi.c index 327e5f3a8abe..6bdb267e9c33 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -33,6 +33,12 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, return NULL; } +static inline ktime_t net_to_ktime(unsigned long t) +{ + /* napi approximating usecs, reverse busy_loop_current_time */ + return ns_to_ktime(t << 10); +} + void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) { struct hlist_head *hash_list; @@ -102,14 +108,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) __io_napi_remove_stale(ctx); } -static inline bool io_napi_busy_loop_timeout(unsigned long start_time, - unsigned long bp_usec) +static inline bool io_napi_busy_loop_timeout(ktime_t start_time, + ktime_t bp) { - if (bp_usec) { - unsigned long end_time = start_time + bp_usec; - unsigned long now = busy_loop_current_time(); + if (bp) { + ktime_t end_time = ktime_add(start_time, bp); + ktime_t now = net_to_ktime(busy_loop_current_time()); - return time_after(now, end_time); + return ktime_after(now, end_time); } return true; @@ -124,7 +130,8 @@ static bool io_napi_busy_loop_should_end(void *data, return true; if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return true; - if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + if (io_napi_busy_loop_timeout(net_to_ktime(start_time), + iowq->napi_busy_poll_dt)) return true; return false; @@ -181,10 +188,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, */ void io_napi_init(struct io_ring_ctx *ctx) { + u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; + INIT_LIST_HEAD(&ctx->napi_list); spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; - ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); + ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); } /* @@ -217,7 +226,7 @@ void io_napi_free(struct io_ring_ctx *ctx) int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; struct io_uring_napi napi; @@ -232,7 +241,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); WRITE_ONCE(ctx->napi_enabled, true); return 0; @@ -249,14 +258,14 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { - .busy_poll_to = ctx->napi_busy_poll_to, + .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), .prefer_busy_poll = ctx->napi_prefer_busy_poll }; if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); WRITE_ONCE(ctx->napi_enabled, false); return 0; @@ -275,23 +284,20 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, struct timespec64 *ts) { - unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); if (ts) { struct timespec64 poll_to_ts; - poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt)); if (timespec64_compare(ts, &poll_to_ts) < 0) { s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) { - u64 val = poll_to_ns + 999; - do_div(val, 1000); - poll_to = val; - } + if (poll_to_ns > 0) + poll_dt = ns_to_ktime(poll_to_ns); } } - iowq->napi_busy_poll_to = poll_to; + iowq->napi_busy_poll_dt = poll_dt; } /* @@ -320,7 +326,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) LIST_HEAD(napi_list); bool is_stale = false; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return 0; if (list_empty_careful(&ctx->napi_list)) return 0; diff --git a/io_uring/napi.h b/io_uring/napi.h index 6fc0393d0dbe..babbee36cd3e 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_busy_poll_to)) + if (!READ_ONCE(ctx->napi_busy_poll_dt)) return; sock = sock_from_file(req->file); From 358169617602f6f71b31e5c9532a09b95a34b043 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Jul 2024 15:24:31 +0100 Subject: [PATCH 101/120] io_uring/napi: pass ktime to io_napi_adjust_timeout Pass the waiting time for __io_napi_adjust_timeout as ktime and get rid of all timespec64 conversions. It's especially simpler since the caller already have a ktime. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4f5b8e8eed4f53a1879e031a6712b25381adc23d.1722003776.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- io_uring/napi.c | 14 +++----------- io_uring/napi.h | 8 ++++---- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2626424f5d73..3942db160f18 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2416,12 +2416,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (uts) { struct timespec64 ts; + ktime_t dt; if (get_timespec64(&ts, uts)) return -EFAULT; - iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - io_napi_adjust_timeout(ctx, &iowq, &ts); + dt = timespec64_to_ktime(ts); + iowq.timeout = ktime_add(dt, ktime_get()); + io_napi_adjust_timeout(ctx, &iowq, dt); } if (sig) { diff --git a/io_uring/napi.c b/io_uring/napi.c index 6bdb267e9c33..4fd6bb331e1e 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -282,20 +282,12 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) * the NAPI timeout accordingly. */ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); - if (ts) { - struct timespec64 poll_to_ts; - - poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt)); - if (timespec64_compare(ts, &poll_to_ts) < 0) { - s64 poll_to_ns = timespec64_to_ns(ts); - if (poll_to_ns > 0) - poll_dt = ns_to_ktime(poll_to_ns); - } - } + if (to_wait) + poll_dt = min(poll_dt, to_wait); iowq->napi_busy_poll_dt = poll_dt; } diff --git a/io_uring/napi.h b/io_uring/napi.h index babbee36cd3e..88f1c21d5548 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -18,7 +18,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, struct timespec64 *ts); + struct io_wait_queue *iowq, ktime_t to_wait); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -29,11 +29,11 @@ static inline bool io_napi(struct io_ring_ctx *ctx) static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { if (!io_napi(ctx)) return; - __io_napi_adjust_timeout(ctx, iowq, ts); + __io_napi_adjust_timeout(ctx, iowq, to_wait); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, @@ -88,7 +88,7 @@ static inline void io_napi_add(struct io_kiocb *req) } static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, - struct timespec64 *ts) + ktime_t to_wait) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, From 5779d398dbcd74c30a641c209946b8498e668a53 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 23 Jul 2024 18:12:40 -0500 Subject: [PATCH 102/120] smb3: add dynamic tracepoint for reflink errors There are cases where debugging clone_range ("smb2_duplicate_extents" function) and in the future copy_range ("smb2_copychunk_range") can be helpful. Add dynamic trace points for any errors in clone, and a followon patch will add them for copychunk. "trace-cmd record -e smb3_clone_err" Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 5 ++++ fs/smb/client/trace.h | 57 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 7fe59235f090..1539463285ee 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2075,6 +2075,11 @@ smb2_duplicate_extents(const unsigned int xid, cifs_dbg(FYI, "Non-zero response length in duplicate extents\n"); duplicate_extents_out: + if (rc) + trace_smb3_clone_err(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, + tcon->tid, tcon->ses->Suid, src_off, + dest_off, len, rc); return rc; } diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 36d5295c2a6f..b027282d6db7 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -206,6 +206,63 @@ DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err); DEFINE_SMB3_OTHER_ERR_EVENT(zero_err); DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err); +/* + * For logging errors in reflink and copy_range ops e.g. smb2_copychunk_range + * and smb2_duplicate_extents + */ +DECLARE_EVENT_CLASS(smb3_copy_range_err_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len, + int rc), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + __entry->rc = rc; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc) +) + +#define DEFINE_SMB3_COPY_RANGE_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len, \ + int rc), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc)) + +DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); +/* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ /* For logging successful read or write */ DECLARE_EVENT_CLASS(smb3_rw_done_class, From 6629f87b97e0740431b7b29b8dfdfa9d842c4bc5 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 24 Jul 2024 11:57:18 -0500 Subject: [PATCH 103/120] smb3: add four dynamic tracepoints for copy_file_range and reflink Add more dynamic tracepoints to help debug copy_file_range (copychunk) and clone_range ("duplicate extents"). These are tracepoints for entering the function and completing without error. For example: "trace-cmd record -e smb3_copychunk_enter -e smb3_copychunk_done" or "trace-cmd record -e smb3_clone_enter -e smb3_clone_done" Here is sample output: TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | cp-5964 [005] ..... 2176.168977: smb3_clone_enter: xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15 source offset=0x0 target fid=0x1ed02e15 target offset=0x0 len=0xa0000 cp-5964 [005] ..... 2176.170668: smb3_clone_done: xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15 source offset=0x0 target fid=0x1ed02e15 target offset=0x0 len=0xa0000 Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 15 +++++++++++- fs/smb/client/trace.h | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 1539463285ee..322cabc69c6f 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1812,6 +1812,10 @@ smb2_copychunk_range(const unsigned int xid, tcon = tlink_tcon(trgtfile->tlink); + trace_smb3_copychunk_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); + while (len > 0) { pcchunk->SourceOffset = cpu_to_le64(src_off); pcchunk->TargetOffset = cpu_to_le64(dest_off); @@ -1863,6 +1867,9 @@ smb2_copychunk_range(const unsigned int xid, le32_to_cpu(retbuf->ChunksWritten), le32_to_cpu(retbuf->ChunkBytesWritten), bytes_written); + trace_smb3_copychunk_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); } else if (rc == -EINVAL) { if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) goto cchunk_out; @@ -2046,7 +2053,9 @@ smb2_duplicate_extents(const unsigned int xid, dup_ext_buf.ByteCount = cpu_to_le64(len); cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n", src_off, dest_off, len); - + trace_smb3_clone_enter(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); inode = d_inode(trgtfile->dentry); if (inode->i_size < dest_off + len) { rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); @@ -2080,6 +2089,10 @@ duplicate_extents_out: trgtfile->fid.volatile_fid, tcon->tid, tcon->ses->Suid, src_off, dest_off, len, rc); + else + trace_smb3_clone_done(xid, srcfile->fid.volatile_fid, + trgtfile->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dest_off, len); return rc; } diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index b027282d6db7..bc4f8b3ad6ff 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -264,6 +264,59 @@ DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); /* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ +DECLARE_EVENT_CLASS(smb3_copy_range_done_class, + TP_PROTO(unsigned int xid, + __u64 src_fid, + __u64 target_fid, + __u32 tid, + __u64 sesid, + __u64 src_offset, + __u64 target_offset, + __u32 len), + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u64, src_fid) + __field(__u64, target_fid) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, src_offset) + __field(__u64, target_offset) + __field(__u32, len) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->src_fid = src_fid; + __entry->target_fid = target_fid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->src_offset = src_offset; + __entry->target_offset = target_offset; + __entry->len = len; + ), + TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x", + __entry->xid, __entry->sesid, __entry->tid, __entry->target_fid, + __entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len) +) + +#define DEFINE_SMB3_COPY_RANGE_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_copy_range_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u64 src_fid, \ + __u64 target_fid, \ + __u32 tid, \ + __u64 sesid, \ + __u64 src_offset, \ + __u64 target_offset, \ + __u32 len), \ + TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len)) + +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_enter); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_done); +DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_done); + + /* For logging successful read or write */ DECLARE_EVENT_CLASS(smb3_rw_done_class, TP_PROTO(unsigned int rreq_debug_id, From b6f6a7aa689f1c255e06fee3ca13c9f9e5c12780 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 26 Jul 2024 01:06:20 -0500 Subject: [PATCH 104/120] smb3: add dynamic trace point for session setup key expired failures There are cases where services need to remount (or change their credentials files) when keys have expired, but it can be helpful to have a dynamic trace point to make it easier to notify the service to refresh the storage account key. Here is sample output, one from mount with bad password, one from a reconnect where the password has been changed or expired and reconnect fails (requiring remount with new storage account key) TASK-PID CPU# ||||| TIMESTAMP FUNCTION | | | ||||| | | mount.cifs-11362 [000] ..... 6000.241620: smb3_key_expired: rc=-13 user=testpassu conn_id=0x2 server=localhost addr=127.0.0.1:445 kworker/4:0-8458 [004] ..... 6044.892283: smb3_key_expired: rc=-13 user=testpassu conn_id=0x3 server=localhost addr=127.0.0.1:445 Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 8 +++++++- fs/smb/client/trace.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9fc5b11c0b6c..9a06b5594669 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1562,8 +1562,14 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_small_buf_release(sess_data->iov[0].iov_base); if (rc == 0) sess_data->ses->expired_pwd = false; - else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) + else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + if (sess_data->ses->expired_pwd == false) + trace_smb3_key_expired(sess_data->server->hostname, + sess_data->ses->user_name, + sess_data->server->conn_id, + &sess_data->server->dstaddr, rc); sess_data->ses->expired_pwd = true; + } memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index bc4f8b3ad6ff..6b3bdfb97211 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -1281,6 +1281,46 @@ DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); +DECLARE_EVENT_CLASS(smb3_sess_setup_err_class, + TP_PROTO(char *hostname, char *username, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, username, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __string(username, username) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname); + __assign_str(username); + ), + TP_printk("rc=%d user=%s conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __get_str(username), + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_SES_SETUP_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_sess_setup_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + char *username, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, username, conn_id, addr, rc)) + +DEFINE_SMB3_SES_SETUP_ERR_EVENT(key_expired); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, From 52e130764ab6bdc439bcf124ac3e15f52ca0c8e5 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 10 Jul 2024 11:34:38 +0200 Subject: [PATCH 105/120] tools/power turbostat: Move debug prints from stdout to stderr This leaves the stdout cleaner, having only counter data. It makes it easier for programs to parse the output of turbostat, for example selftests. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index be345a4bbe96..cccd9f4311a0 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1984,6 +1984,7 @@ void help(void) " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " debug messages are printed to stderr\n" " -D, --Dump displays the raw counter values\n" " -e, --enable [all | column]\n" " shows all or the specified disabled column\n" @@ -8262,7 +8263,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) pinfo->scale = perf_scale; if (debug) - printf("Add perf/%s/%s cpu%d: %d\n", + fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); } @@ -8422,7 +8423,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name) for (mp = head; mp; mp = mp->next) { if (debug) - printf("%s: %s %s\n", __func__, name, mp->name); + fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name); if (!strncmp(name, mp->name, strlen(mp->name))) return mp; } @@ -8439,8 +8440,8 @@ int add_counter(unsigned int msr_num, char *path, char *name, errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num); if (debug) - printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num, - path, name, width, scope, type, format, flags, id); + fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", + __func__, msr_num, path, name, width, scope, type, format, flags, id); switch (scope) { @@ -8448,7 +8449,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.tp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) { @@ -8460,7 +8461,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.cp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) { @@ -8472,7 +8473,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, msrp = find_msrp_by_name(sys.pp, name); if (msrp) { if (debug) - printf("%s: %s FOUND\n", __func__, name); + fprintf(stderr, "%s: %s FOUND\n", __func__, name); break; } if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) { @@ -8617,7 +8618,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char // FIXME: we might not have debug here yet if (debug) - printf("%s: %s/%s, name: %s, scope%d\n", + fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); return 0; From b2e4a5dfafcce87ed2b9aac0b2887f421cd2930a Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 10 Jul 2024 14:28:03 +0200 Subject: [PATCH 106/120] tools/power turbostat: Move verbose counter messages to level 2 Printing information about the source and value during initialization and reading of the counter for each cpu, while useful when debugging, results in too verbose output. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index cccd9f4311a0..86bbea7d8e57 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3797,7 +3797,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct unsigned long long perf_data[NUM_RAPL_COUNTERS + 1]; struct rapl_counter_info_t *rci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain); assert(rapl_counter_info_perdomain); @@ -3827,7 +3827,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct assert(pi < ARRAY_SIZE(perf_data)); assert(rci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n", i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]); @@ -3837,7 +3837,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct break; case COUNTER_SOURCE_MSR: - if (debug) + if (debug >= 2) fprintf(stderr, "Reading rapl counter via msr at %u\n", i); assert(!no_msr); @@ -3895,7 +3895,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat struct cstate_counter_info_t *cci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(ccstate_counter_info); @@ -3965,9 +3965,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat assert(pi < ARRAY_SIZE(perf_data)); assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1); - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]); - } cci->data[i] = perf_data[pi]; @@ -3979,9 +3978,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat if (get_msr(cpu, cci->msr[i], &cci->data[i])) return -13 - i; - if (debug) { + if (debug >= 2) fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]); - } break; } @@ -4036,7 +4034,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) struct msr_counter_info_t *mci; - if (debug) + if (debug >= 2) fprintf(stderr, "%s: cpu%d\n", __func__, cpu); assert(msr_counter_info); @@ -4066,7 +4064,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) assert(pi < ARRAY_SIZE(perf_data)); assert(mci->fd_perf != -1); - if (debug) + if (debug >= 2) fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]); mci->data[i] = perf_data[pi]; @@ -4082,7 +4080,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t) mci->data[i] &= mci->msr_mask[i]; - if (debug) + if (debug >= 2) fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]); break; @@ -7125,7 +7123,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct { int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; @@ -7284,7 +7282,7 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st { int ret = add_cstate_perf_counter_(cpu, cci, cai); - if (debug) + if (debug >= 2) fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu); return ret; From 1f8add13e6c86653da2ec599dfc94e7a8865cdcb Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 3 Jul 2024 23:14:14 +0200 Subject: [PATCH 107/120] tools/power turbostat: Add selftests for SMI, APERF and MPERF counters The test requests BICs that are dependent on SMI, APERF and MPERF counters and checks if the columns show up in the output and the turbostat doesn't crash. Read the counters in both --no-msr and --no-perf mode. The test skips counters that are not present or user does not have permissions to read. It is not an error, but the test may not be as exhaustive. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- .../selftests/turbostat/smi_aperf_mperf.py | 157 ++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 tools/testing/selftests/turbostat/smi_aperf_mperf.py diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py new file mode 100755 index 000000000000..6289cc47d5f0 --- /dev/null +++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py @@ -0,0 +1,157 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +# CDLL calls dlopen underneath. +# Calling it with None (null), we get handle to the our own image (python interpreter). +# We hope to find sched_getcpu() inside ;] +# This is a bit ugly, but helps shipping working software, so.. +try: + import ctypes + + this_image = ctypes.CDLL(None) + BASE_CPU = this_image.sched_getcpu() +except: + BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline. + +MSR_IA32_MPERF = 0x000000e7 +MSR_IA32_APERF = 0x000000e8 + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + if b'' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.') + return False + + return True + + if not has_perf_counter_access('msr/mperf/'): + return False + if not has_perf_counter_access('msr/aperf/'): + return False + if not has_perf_counter_access('msr/smi/'): + return False + + return True + +def check_msr_access(): + try: + file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb') + except: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8: + return False + + if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8: + return False + + return True + +has_perf_access = check_perf_access() +has_msr_access = check_msr_access() + +turbostat_counter_source_opts = [''] + +if has_msr_access: + turbostat_counter_source_opts.append('--no-perf') +else: + print('SKIP: doesn\'t have MSR access, skipping run with --no-perf') + +if has_perf_access: + turbostat_counter_source_opts.append('--no-msr') +else: + print('SKIP: doesn\'t have perf access, skipping run with --no-msr') + +if not has_msr_access and not has_perf_access: + print('SKIP: No MSR nor perf access detected. Skipping the tests entirely') + exit(0) + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC' + +SMI_APERF_MPERF_DEPENDENT_BICS = [ + 'SMI', + 'Avg_MHz', + 'Busy%', + 'Bzy_MHz', +] +if has_perf_access: + SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC') + +for bic in SMI_APERF_MPERF_DEPENDENT_BICS: + for counter_source_opt in turbostat_counter_source_opts: + + # Ugly special case, but it is what it is.. + if counter_source_opt == '--no-perf' and bic == 'IPC': + continue + + expected_columns = bic.encode() + expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode() + + # + # Run turbostat for some time and send SIGINT + # + timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] + turbostat_argv = [turbostat, '-i', '0.50', '--show', bic] + + if counter_source_opt: + turbostat_argv.append(counter_source_opt) + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns != actual_columns: + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + print('OK') + + # + # Same, but with --debug + # + turbostat_argv.append('--debug') + + print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True) + proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) + if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + + actual_columns = proc_turbostat.stdout.split(b'\n')[0] + if expected_columns_debug != actual_columns: + print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}') + exit(1) + print('OK') From f3065f9c3917fa9279992623a2f3282f1fd43515 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Thu, 4 Jul 2024 20:11:33 +0200 Subject: [PATCH 108/120] tools/power turbostat: Add selftests for added perf counters Test adds several perf counters from msr, cstate_core and cstate_pkg groups and checks if the columns for those counters show up. The test skips the counters that are not present. It is not an error, but the test may not be as exhaustive. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- .../turbostat/added_perf_counters.py | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100755 tools/testing/selftests/turbostat/added_perf_counters.py diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py new file mode 100755 index 000000000000..9ab4aaf45fb8 --- /dev/null +++ b/tools/testing/selftests/turbostat/added_perf_counters.py @@ -0,0 +1,178 @@ +#!/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import subprocess +from shutil import which +from os import pread + +class PerfCounterInfo: + def __init__(self, subsys, event): + self.subsys = subsys + self.event = event + + def get_perf_event_name(self): + return f'{self.subsys}/{self.event}/' + + def get_turbostat_perf_id(self, counter_scope, counter_type, column_name): + return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}' + +PERF_COUNTERS_CANDIDATES = [ + PerfCounterInfo('msr', 'mperf'), + PerfCounterInfo('msr', 'aperf'), + PerfCounterInfo('msr', 'tsc'), + PerfCounterInfo('cstate_core', 'c1-residency'), + PerfCounterInfo('cstate_core', 'c6-residency'), + PerfCounterInfo('cstate_core', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c2-residency'), + PerfCounterInfo('cstate_pkg', 'c3-residency'), + PerfCounterInfo('cstate_pkg', 'c6-residency'), + PerfCounterInfo('cstate_pkg', 'c7-residency'), + PerfCounterInfo('cstate_pkg', 'c8-residency'), + PerfCounterInfo('cstate_pkg', 'c9-residency'), + PerfCounterInfo('cstate_pkg', 'c10-residency'), +] +present_perf_counters = [] + +def check_perf_access(): + perf = which('perf') + if perf is None: + print('SKIP: Could not find perf binary, thus could not determine perf access.') + return False + + def has_perf_counter_access(counter_name): + proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'], + capture_output = True) + + if proc_perf.returncode != 0: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + if b'' in proc_perf.stderr: + print(f'SKIP: Could not read {counter_name} perf counter.') + return False + + return True + + for counter in PERF_COUNTERS_CANDIDATES: + if has_perf_counter_access(counter.get_perf_event_name()): + present_perf_counters.append(counter) + + if len(present_perf_counters) == 0: + print('SKIP: Could not read any perf counter.') + return False + + if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES): + print(f'WARN: Could not access all of the counters - some will be left untested') + + return True + +if not check_perf_access(): + exit(0) + +turbostat_counter_source_opts = [''] + +turbostat = which('turbostat') +if turbostat is None: + print('Could not find turbostat binary') + exit(1) + +timeout = which('timeout') +if timeout is None: + print('Could not find timeout binary') + exit(1) + +proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC'] + +expected_columns = [b'CPU'] +counters_argv = [] +for counter in present_perf_counters: + if counter.subsys == 'cstate_core': + counter_scope = 'core' + elif counter.subsys == 'cstate_pkg': + counter_scope = 'package' + else: + counter_scope = 'cpu' + + counter_type = 'delta' + column_name = counter.event + + cparams = counter.get_turbostat_perf_id( + counter_scope = counter_scope, + counter_type = counter_type, + column_name = column_name + ) + expected_columns.append(column_name.encode()) + counters_argv.extend(['--add', cparams]) + +expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns + +def gen_user_friendly_cmdline(argv_): + argv = argv_[:] + ret = '' + + while len(argv) != 0: + arg = argv.pop(0) + arg_next = '' + + if arg in ('-i', '--show', '--add'): + arg_next = argv.pop(0) if len(argv) > 0 else '' + + ret += f'{arg} {arg_next} \\\n\t' + + # Remove the last separator and return + return ret[:-4] + +# +# Run turbostat for some time and send SIGINT +# +timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s'] +turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv + +def check_columns_or_fail(expected_columns: list, actual_columns: list): + if len(actual_columns) != len(expected_columns): + print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}') + exit(1) + + failed = False + for expected_column in expected_columns: + if expected_column not in actual_columns: + print(f'turbostat column check failed: missing column {expected_column.decode()}') + failed = True + + if failed: + exit(1) + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns, actual_columns) +print('OK') + +# +# Same, but with --debug +# +# We explicitly specify '--show CPU' to make sure turbostat +# don't show a bunch of default counters instead. +# +turbostat_argv.append('--debug') + +cmdline = gen_user_friendly_cmdline(turbostat_argv) +print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... ', end = '', flush = True) +proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True) +if proc_turbostat.returncode != 0: + print(f'turbostat failed with {proc_turbostat.returncode}') + exit(1) + +actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t') +check_columns_or_fail(expected_columns_debug, actual_columns) +print('OK') From f0e4ed752fda6997b41917c94a5478b340178001 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jul 2024 22:11:03 +0200 Subject: [PATCH 109/120] tools/power turbostat: Add early support for PMT counters Allows users to read Intel PMT (Platform Monitoring Technology) counters, providing interface similar to one used to add MSR and perf counters. Because PMT is exposed as a raw MMIO range, without metadata, user has to supply the necessary information to find and correctly display the requested counter. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 768 +++++++++++++++++++++++++- 1 file changed, 766 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 86bbea7d8e57..c9fdfb074ce9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,9 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi #define MAX_ADDED_THREAD_COUNTERS 24 #define MAX_ADDED_CORE_COUNTERS 8 #define MAX_ADDED_PACKAGE_COUNTERS 16 +#define PMT_MAX_ADDED_THREAD_COUNTERS 24 +#define PMT_MAX_ADDED_CORE_COUNTERS 8 +#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16 #define BITMASK_SIZE 32 #define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr)) @@ -1466,6 +1470,100 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { }, }; +/* Can be redefined when compiling, useful for testing. */ +#ifndef SYSFS_TELEM_PATH +#define SYSFS_TELEM_PATH "/sys/class/intel_pmt" +#endif + +#define PMT_COUNTER_NAME_SIZE_BYTES 16 +#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 + +struct pmt_mmio { + struct pmt_mmio *next; + + unsigned int guid; + unsigned int size; + + /* Base pointer to the mmaped memory. */ + void *mmio_base; + + /* + * Offset to be applied to the mmio_base + * to get the beginning of the PMT counters for given GUID. + */ + unsigned long pmt_offset; +} *pmt_mmios; + +enum pmt_datatype { + PMT_TYPE_RAW, +}; + +struct pmt_domain_info { + /* + * Pointer to the MMIO obtained by applying a counter offset + * to the mmio_base of the mmaped region for the given GUID. + * + * This is where to read the raw value of the counter from. + */ + unsigned long *pcounter; +}; + +struct pmt_counter { + struct pmt_counter *next; + + /* PMT metadata */ + char name[PMT_COUNTER_NAME_SIZE_BYTES]; + enum pmt_datatype type; + enum counter_scope scope; + unsigned int lsb; + unsigned int msb; + + /* BIC-like metadata */ + enum counter_format format; + + unsigned int num_domains; + struct pmt_domain_info *domains; +}; + +unsigned int pmt_counter_get_width(const struct pmt_counter *p) +{ + return (p->msb - p->lsb)+1; +} + +void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) +{ + struct pmt_domain_info *new_mem; + + new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + if (!new_mem) { + fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); + exit(1); + } + + /* Zero initialize just allocated memory. */ + const size_t num_new_domains = new_size - pcounter->num_domains; + + memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains)); + + pcounter->num_domains = new_size; + pcounter->domains = new_mem; +} + +void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) +{ + /* + * Allocate more memory ahead of time. + * + * Always allocate space for at least 8 elements + * and double the size when growing. + */ + if (new_size < 8) + new_size = 8; + new_size = MAX(new_size, pcounter->num_domains*2); + + pmt_counter_resize_(pcounter, new_size); +} + struct thread_data { struct timeval tv_begin; struct timeval tv_end; @@ -1484,6 +1582,7 @@ struct thread_data { bool is_atom; unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS]; } *thread_even, *thread_odd; struct core_data { @@ -1498,6 +1597,7 @@ struct core_data { unsigned long long core_throt_cnt; unsigned long long counter[MAX_ADDED_CORE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS]; } *core_even, *core_odd; struct pkg_data { @@ -1532,6 +1632,7 @@ struct pkg_data { unsigned int uncore_mhz; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; + unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; } *package_even, *package_odd; #define ODD_COUNTERS thread_odd, core_odd, package_odd @@ -1681,6 +1782,10 @@ struct sys_counters { struct perf_counter_info *perf_tp; struct perf_counter_info *perf_cp; struct perf_counter_info *perf_pp; + + struct pmt_counter *pmt_tp; + struct pmt_counter *pmt_cp; + struct pmt_counter *pmt_pp; } sys; static size_t free_msr_counters_(struct msr_counter **pp) @@ -1981,6 +2086,7 @@ void help(void) " -a, --add add a counter\n" " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" " eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n" + " eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" @@ -2092,6 +2198,7 @@ void print_header(char *delim) { struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; int printed = 0; if (DO_BIC(BIC_USEC)) @@ -2164,6 +2271,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_tp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_c3)) @@ -2219,6 +2341,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_cp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + if (DO_BIC(BIC_PkgTmp)) outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : "")); @@ -2329,6 +2466,21 @@ void print_header(char *delim) } } + ppmt = sys.pmt_pp; + while (ppmt) { + switch(ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); + else + outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); + + break; + } + + ppmt = ppmt->next; + } + outp += sprintf(outp, "\n"); } @@ -2450,6 +2602,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; char *delim = "\t"; int printed = 0; @@ -2612,6 +2765,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)t->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); + + break; + } + } + /* C1 */ if (DO_BIC(BIC_CPU_c1)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); @@ -2673,6 +2839,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)c->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); + + break; + } + } + fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl) @@ -2842,9 +3021,23 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]); } else if (pp->format == FORMAT_PERCENT) { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc); - } else if (pp->type == COUNTER_K2M) + } else if (pp->type == COUNTER_K2M) { outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000); + } + } + + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + switch (ppmt->type) { + case PMT_TYPE_RAW: + if (pmt_counter_get_width(ppmt) <= 32) + outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), + (unsigned int)p->pmt_counter[i]); + else + outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); + + break; + } } done: @@ -2901,6 +3094,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; @@ -2970,6 +3164,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -2978,6 +3179,7 @@ void delta_core(struct core_data *new, struct core_data *old) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; old->c3 = new->c3 - old->c3; old->c6 = new->c6 - old->c6; @@ -3001,6 +3203,13 @@ void delta_core(struct core_data *new, struct core_data *old) else old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } } int soft_c1_residency_display(int bic) @@ -3019,6 +3228,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* we run cpuid just the 1st time, copy the results */ if (DO_BIC(BIC_APIC)) @@ -3105,6 +3315,13 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + if (ppmt->format == FORMAT_RAW) + old->pmt_counter[i] = new->pmt_counter[i]; + else + old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i]; + } + return 0; } @@ -3211,6 +3428,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data memset(&t->perf_counter[0], 0, sizeof(t->perf_counter)); memset(&c->perf_counter[0], 0, sizeof(c->perf_counter)); memset(&p->perf_counter[0], 0, sizeof(p->perf_counter)); + + memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter)); + memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter)); + memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter)); } void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src) @@ -3232,6 +3453,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; /* copy un-changing apic_id's */ if (DO_BIC(BIC_APIC)) @@ -3268,6 +3490,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.threads.perf_counter[i] += t->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] += t->pmt_counter[i]; + } + /* sum per-core values only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) return 0; @@ -3294,6 +3520,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.cores.perf_counter[i] += c->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] += c->pmt_counter[i]; + } + /* sum per-pkg values only for 1st core in pkg */ if (!is_cpu_first_core_in_package(t, c, p)) return 0; @@ -3353,6 +3583,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.perf_counter[i] += p->perf_counter[i]; } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] += p->pmt_counter[i]; + } + return 0; } @@ -3365,6 +3599,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data int i; struct msr_counter *mp; struct perf_counter_info *pp; + struct pmt_counter *ppmt; clear_counters(&average.threads, &average.cores, &average.packages); @@ -3465,6 +3700,16 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data } average.packages.perf_counter[i] /= topo.allowed_packages; } + + for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + average.threads.pmt_counter[i] /= topo.allowed_cpus; + } + for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) { + average.cores.pmt_counter[i] /= topo.allowed_cores; + } + for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) { + average.packages.pmt_counter[i] /= topo.allowed_packages; + } } static unsigned long long rdtsc(void) @@ -4120,6 +4365,32 @@ int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigne return 0; } +unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) +{ + unsigned long mask; + + if (msb == 63) + mask = 0xffffffffffffffff; + else + mask = ((1<<(msb+1))-1); + + mask -= (1<num_domains); + + const unsigned long *pmmio = ppmt->domains[domain_id].pcounter; + const unsigned long value = pmmio ? *pmmio : 0; + const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb); + const unsigned long value_shift = ppmt->lsb; + + return (value & value_mask) >> value_shift; +} + /* * get_counters(...) * migrate to cpu @@ -4130,6 +4401,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) int cpu = t->cpu_id; unsigned long long msr; struct msr_counter *mp; + struct pmt_counter *pp; int i; int status; @@ -4164,6 +4436,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next) + t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id); + /* collect core counters only for 1st thread in core */ if (!is_cpu_first_thread_in_core(t, c, p)) goto done; @@ -4205,6 +4480,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next) + c->pmt_counter[i] = pmt_read_counter(pp, c->core_id); + /* collect package counters only for 1st core in package */ if (!is_cpu_first_core_in_package(t, c, p)) goto done; @@ -4281,6 +4559,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS)) return -10; + for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next) + p->pmt_counter[i] = pmt_read_counter(pp, p->package_id); + done: gettimeofday(&t->tv_end, (struct timezone *)NULL); @@ -8285,6 +8566,304 @@ void added_perf_counters_init(void) errx(1, "%s: %s", __func__, "package"); } +int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output) +{ + int fd_telem_info; + FILE *file_telem_info; + unsigned long value; + + fd_telem_info = openat(fd_dir, info_filename, O_RDONLY); + if (fd_telem_info == -1) + return -1; + + file_telem_info = fdopen(fd_telem_info, "r"); + if (file_telem_info == NULL) { + close(fd_telem_info); + return -1; + } + + if (fscanf(file_telem_info, format, &value) != 1) { + fclose(file_telem_info); + return -1; + } + + fclose(file_telem_info); + + *output = value; + + return 0; +} + +struct pmt_mmio* pmt_mmio_open(unsigned int target_guid) +{ + DIR *dirp; + struct dirent *entry; + struct stat st; + unsigned int telem_idx; + int fd_telem_dir, fd_pmt; + unsigned long guid, size, offset; + size_t mmap_size; + void *mmio; + struct pmt_mmio *ret = NULL; + + if (stat(SYSFS_TELEM_PATH, &st) == -1) + return NULL; + + dirp = opendir(SYSFS_TELEM_PATH); + if (dirp == NULL) + return NULL; + + for (;;) { + entry = readdir(dirp); + + if (entry == NULL) + break; + + if (strcmp(entry->d_name, ".") == 0) + continue; + + if (strcmp(entry->d_name, "..") == 0) + continue; + + if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1) + continue; + + if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) { + break; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY); + if (fd_telem_dir == -1) { + break; + } + + if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) { + close(fd_telem_dir); + break; + } + + if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) { + close(fd_telem_dir); + break; + } + + if (guid != target_guid) { + close(fd_telem_dir); + continue; + } + + if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) { + close(fd_telem_dir); + break; + } + + assert(offset == 0); + + fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY); + if (fd_pmt == -1) + goto loop_cleanup_and_break; + + mmap_size = (size + 0x1000UL) & (~0x1000UL); + mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0); + if (mmio != MAP_FAILED) { + + if (debug) + fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio); + + ret = calloc(1, sizeof(*ret)); + + if (!ret) { + fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__); + exit(1); + } + + ret->guid = guid; + ret->mmio_base = mmio; + ret->pmt_offset = offset; + ret->size = size; + + ret->next = pmt_mmios; + pmt_mmios = ret; + } + +loop_cleanup_and_break: + close(fd_pmt); + close(fd_telem_dir); + break; + } + + closedir(dirp); + + return ret; +} + +struct pmt_mmio* pmt_mmio_find(unsigned int guid) +{ + struct pmt_mmio *pmmio = pmt_mmios; + + while (pmmio) { + if (pmmio->guid == guid) + return pmmio; + + pmmio = pmmio->next; + } + + return NULL; +} + +void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +{ + char *ret; + + /* Get base of mmaped PMT file. */ + ret = (char*)pmmio->mmio_base; + + /* + * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. + * It's not guaranteed that the mmaped memory begins with the telemetry data + * - we might have to apply the offset first. + */ + ret += pmmio->pmt_offset; + + /* Apply the counter offset to get the address to the mmaped counter. */ + ret += counter_offset; + + return ret; +} + +struct pmt_mmio* pmt_add_guid(unsigned int guid) +{ + struct pmt_mmio *ret; + + ret = pmt_mmio_find(guid); + if (!ret) + ret = pmt_mmio_open(guid); + + return ret; +} + +enum pmt_open_mode { + PMT_OPEN_TRY, /* Open failure is not an error. */ + PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */ +}; + +struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name) +{ + while (pcounter) { + if (strcmp(pcounter->name, name) == 0) + break; + + pcounter = pcounter->next; + } + + return pcounter; +} + +struct pmt_counter** pmt_get_scope_root(enum counter_scope scope) +{ + switch(scope) { + case SCOPE_CPU: + return &sys.pmt_tp; + case SCOPE_CORE: + return &sys.pmt_cp; + case SCOPE_PACKAGE: + return &sys.pmt_pp; + } + + __builtin_unreachable(); +} + +void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id) +{ + /* Make sure the new domain fits. */ + if (domain_id >= pcounter->num_domains) + pmt_counter_resize(pcounter, domain_id+1); + + assert(pcounter->domains); + assert(domain_id < pcounter->num_domains); + + pcounter->domains[domain_id].pcounter = pmmio; +} + +int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) +{ + struct pmt_mmio *mmio; + struct pmt_counter *pcounter; + struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope); + bool new_counter = false; + int conflict = 0; + + if (lsb > msb) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name); + exit(1); + } + + if (msb >= 64) { + fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name); + exit(1); + } + + mmio = pmt_add_guid(guid); + if (!mmio) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid); + exit(1); + } + + return 1; + } + + if (offset >= mmio->size) { + if (mode != PMT_OPEN_TRY) { + fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size); + exit(1); + } + + return 1; + } + + pcounter = pmt_find_counter(*pmt_root, name); + if (!pcounter) { + pcounter = calloc(1, sizeof(*pcounter)); + new_counter = true; + } + + if (new_counter) { + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1); + pcounter->type = type; + pcounter->scope = scope; + pcounter->lsb = lsb; + pcounter->msb = msb; + pcounter->format = format; + } else { + conflict += pcounter->type != type; + conflict += pcounter->scope != scope; + conflict += pcounter->lsb != lsb; + conflict += pcounter->msb != msb; + conflict += pcounter->format != format; + } + + if (conflict) { + fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n", + __func__, name); + exit(1); + } + + pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id); + + if (new_counter) { + pcounter->next = *pmt_root; + *pmt_root = pcounter; + } + + return 0; +} + void turbostat_init() { setup_all_buffers(true); @@ -8622,7 +9201,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char return 0; } -void parse_add_command(char *add_command) +void parse_add_command_msr(char *add_command) { int msr_num = 0; char *path = NULL; @@ -8747,6 +9326,191 @@ next: } } +bool starts_with(const char *str, const char *prefix) +{ + return strncmp(prefix, str, strlen(prefix)) == 0; +} + +void parse_add_command_pmt(char *add_command) +{ + char *name = NULL; + char *type_name = NULL; + char *format_name = NULL; + unsigned int offset; + unsigned int lsb; + unsigned int msb; + unsigned int guid; + unsigned int domain_id; + enum counter_scope scope = 0; + enum pmt_datatype type = PMT_TYPE_RAW; + enum counter_format format = FORMAT_RAW; + bool has_offset = false; + bool has_lsb = false; + bool has_msb = false; + bool has_format = true; /* Format has a default value. */ + bool has_guid = false; + bool has_scope = false; + bool has_type = true; /* Type has a default value. */ + + /* Consume the "pmt," prefix. */ + add_command = strchr(add_command, ','); + if (!add_command) { + help(); + exit(1); + } + ++add_command; + + while (add_command) { + if (starts_with(add_command, "name=")) { + name = add_command + strlen("name="); + goto next; + } + + if (starts_with(add_command, "type=")) { + type_name = add_command + strlen("type="); + goto next; + } + + if (starts_with(add_command, "domain=")) { + const size_t prefix_len = strlen("domain="); + + if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) { + scope = SCOPE_CPU; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) { + scope = SCOPE_CORE; + has_scope = true; + } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) { + scope = SCOPE_PACKAGE; + has_scope = true; + } + + if (!has_scope) { + printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", + __func__); + exit(1); + } + + goto next; + } + + if (starts_with(add_command, "format=")) { + format_name = add_command + strlen("format="); + goto next; + } + + if (sscanf(add_command, "offset=%u", &offset) == 1) { + has_offset = true; + goto next; + } + + if (sscanf(add_command, "lsb=%u", &lsb) == 1) { + has_lsb = true; + goto next; + } + + if (sscanf(add_command, "msb=%u", &msb) == 1) { + has_msb = true; + goto next; + } + + if (sscanf(add_command, "guid=%x", &guid) == 1) { + has_guid = true; + goto next; + } + +next: + add_command = strchr(add_command, ','); + if (add_command) { + *add_command = '\0'; + add_command++; + } + } + + if (!name) { + printf("%s: missing %s\n", __func__, "name"); + exit(1); + } + + if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { + printf("%s: name has to be at most %d characters long\n", + __func__, PMT_COUNTER_NAME_SIZE_BYTES); + exit(1); + } + + if (format_name) { + has_format = false; + + if (strcmp("raw", format_name) == 0) { + format = FORMAT_RAW; + has_format = true; + } + + if (strcmp("delta", format_name) == 0) { + format = FORMAT_DELTA; + has_format = true; + } + + if (!has_format) { + fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name); + exit(1); + } + } + + if (type_name) { + has_type = false; + + if (strcmp("raw", type_name) == 0) { + type = PMT_TYPE_RAW; + has_type = true; + } + + if (!has_type) { + printf("%s: invalid %s: %s\n", __func__, "type", type_name); + exit(1); + } + } + + if (!has_offset) { + printf("%s : missing %s\n", __func__, "offset"); + exit(1); + } + + if (!has_lsb) { + printf("%s: missing %s\n", __func__, "lsb"); + exit(1); + } + + if (!has_msb) { + printf("%s: missing %s\n", __func__, "msb"); + exit(1); + } + + if (!has_guid) { + printf("%s: missing %s\n", __func__, "guid"); + exit(1); + } + + if (!has_scope) { + printf("%s: missing %s\n", __func__, "scope"); + exit(1); + } + + if (lsb > msb) { + printf("%s: lsb > msb doesn't make sense\n", __func__); + exit(1); + } + + pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED); +} + +void parse_add_command(char *add_command) +{ + if (strncmp(add_command, "pmt", strlen("pmt")) == 0) + return parse_add_command_pmt(add_command); + return parse_add_command_msr(add_command); +} + int is_deferred_add(char *name) { int i; From 640540beb88363a825524295664acfdb0f5d5fc2 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Mon, 22 Jul 2024 22:12:22 +0200 Subject: [PATCH 110/120] tools/power turbostat: Add MTL's PMT DC6 builtin counter Provide a definition for metadata that allows reading DC6 residency counter via PMT and exposes it as a builtin counter. Note that this residency counter is updated and read via entirely different mechanisms vs the MSR-based residency counters. On MTL processors, there are times when Die%c6 will report above 100%. This is still useful, but don't expect 3 digits of precision... Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 70 ++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index c9fdfb074ce9..f769e8beabd3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -193,6 +193,7 @@ struct msr_counter bic[] = { { 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 }, { 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 }, + { 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -254,11 +255,12 @@ struct msr_counter bic[] = { #define BIC_SAM_mc6 (1ULL << 55) #define BIC_SAMMHz (1ULL << 56) #define BIC_SAMACTMHz (1ULL << 57) +#define BIC_Diec6 (1ULL << 58) #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ) -#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6) +#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6) #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -1475,6 +1477,11 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = { #define SYSFS_TELEM_PATH "/sys/class/intel_pmt" #endif +#define PMT_COUNTER_MTL_DC6_OFFSET 120 +#define PMT_COUNTER_MTL_DC6_LSB 0 +#define PMT_COUNTER_MTL_DC6_MSB 63 +#define PMT_MTL_DC6_GUID 0x1a067102 + #define PMT_COUNTER_NAME_SIZE_BYTES 16 #define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32 @@ -1496,6 +1503,7 @@ struct pmt_mmio { enum pmt_datatype { PMT_TYPE_RAW, + PMT_TYPE_XTAL_TIME, }; struct pmt_domain_info { @@ -1630,6 +1638,7 @@ struct pkg_data { struct rapl_counter rapl_dram_perf_status; /* MSR_DRAM_PERF_STATUS */ unsigned int pkg_temp_c; unsigned int uncore_mhz; + unsigned long long die_c6; unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS]; unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; @@ -2281,6 +2290,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2351,6 +2364,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2400,6 +2417,8 @@ void print_header(char *delim) outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : "")); if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : "")); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU_LPI)) outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : "")); if (DO_BIC(BIC_SYS_LPI)) @@ -2476,6 +2495,10 @@ void print_header(char *delim) outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name); break; + + case PMT_TYPE_XTAL_TIME: + outp += sprintf(outp, "%s%s", delim, ppmt->name); + break; } ppmt = ppmt->next; @@ -2775,6 +2798,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = t->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -2849,6 +2879,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = c->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -2931,6 +2968,9 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data if (DO_BIC(BIC_Pkgpc10)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); + if (DO_BIC(BIC_Diec6)) + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) outp += @@ -3037,6 +3077,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]); break; + + case PMT_TYPE_XTAL_TIME: + const unsigned long value_raw = p->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; + + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted); + break; } } @@ -3115,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old) old->pc8 = new->pc8 - old->pc8; old->pc9 = new->pc9 - old->pc9; old->pc10 = new->pc10 - old->pc10; + old->die_c6 = new->die_c6 - old->die_c6; old->cpu_lpi = new->cpu_lpi - old->cpu_lpi; old->sys_lpi = new->sys_lpi - old->sys_lpi; old->pkg_temp_c = new->pkg_temp_c; @@ -3398,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data p->pc8 = 0; p->pc9 = 0; p->pc10 = 0; + p->die_c6 = 0; p->cpu_lpi = 0; p->sys_lpi = 0; @@ -3547,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) average.packages.pc8 += p->pc8; average.packages.pc9 += p->pc9; average.packages.pc10 += p->pc10; + average.packages.die_c6 += p->die_c6; average.packages.cpu_lpi = p->cpu_lpi; average.packages.sys_lpi = p->sys_lpi; @@ -3642,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data average.packages.pc8 /= topo.allowed_packages; average.packages.pc9 /= topo.allowed_packages; average.packages.pc10 /= topo.allowed_packages; + average.packages.die_c6 /= topo.allowed_packages; for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) @@ -5505,6 +5556,7 @@ void msr_perf_init(void); void rapl_perf_init(void); void cstate_perf_init(void); void added_perf_counters_init(void); +void pmt_init(void); void re_initialize(void) { @@ -5515,6 +5567,7 @@ void re_initialize(void) rapl_perf_init(); cstate_perf_init(); added_perf_counters_init(); + pmt_init(); fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus, topo.allowed_cpus); } @@ -8864,6 +8917,15 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, return 0; } +void pmt_init(void) +{ + if (BIC_IS_ENABLED(BIC_Diec6)) { + pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB, + PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA, + 0, PMT_OPEN_TRY); + } +} + void turbostat_init() { setup_all_buffers(true); @@ -8878,6 +8940,7 @@ void turbostat_init() rapl_perf_init(); cstate_perf_init(); added_perf_counters_init(); + pmt_init(); for_all_cpus(get_cpu_type, ODD_COUNTERS); for_all_cpus(get_cpu_type, EVEN_COUNTERS); @@ -9465,6 +9528,11 @@ next: has_type = true; } + if (strcmp("txtal_time", type_name) == 0) { + type = PMT_TYPE_XTAL_TIME; + has_type = true; + } + if (!has_type) { printf("%s: invalid %s: %s\n", __func__, "type", type_name); exit(1); From 944264a2a99cc4dd4b10eaa9798b6aab80adf4be Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 23 Jul 2024 20:12:32 +0200 Subject: [PATCH 111/120] tools/power turbostat: Document PMT in turbostat.8 Add a general description of the user interface for adding PMT counters with the new --add pmt,... option. Provide a complete example for requesting two counters. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 65 +++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index 44cbc7cb382d..067717bce1d4 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -55,6 +55,39 @@ name as necessary to disambiguate it from others is necessary. Note that option as the column header. .fi .PP +\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to --add option above, but require PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE". +.nf + name="name_string" + For column header. + + type={\fBraw\fP} + 'raw' shows the counter contents in hex. + default: raw + + format={\fBraw\fP | \fBdelta\fP} + 'raw' shows the counter contents in hex. + 'delta' shows the difference in values during the measurement interval. + default: raw + + domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP} + 'cpu' per cpu/thread counter. + 'core' per core counter. + 'package' per package counter. + '%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4. + + offset=\fB%u\fP + '%u' offset within the PMT MMIO region. + + lsb=\fB%u\fP + '%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask. + + msb=\fB%u\fP + '%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask. + + guid=\fB%x\fP + '%x' hex identifier of the PMT MMIO region. +.fi +.PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP \fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. @@ -354,6 +387,38 @@ CPU pCPU%c1 CPU%c1 .fi +.SH ADD PMT COUNTER EXAMPLE +Here we limit turbostat to showing just the CPU number 0. +We add two counters, showing crystal clock count and the DC6 residency. +All the parameters passed are based on the metadata found in the PMT XML files. + +For the crystal clock count, we +label it with the column header, "XTAL", +we set the type to 'raw', to read the number of clock ticks in hex, +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +For the DC6 residency counter, we +label it with the column header, "Die%c6", +we set the type to 'txtal_time', to obtain the percent residency value +we set the format to 'delta', to display the difference in ticks during the measurement interval, +we set the domain to 'package0', to collect it and associate it with the whole package number 0, +we set the offset to '0', which is a offset of the counter within the PMT MMIO region, +we set the lsb and msb to cover all 64 bits of the read 64 bit value, +and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value. + +.nf +sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102 +0.104352 sec +CPU XTAL Die%c6 +- 0x0000006d4d957ca7 0.00 +0 0x0000006d4d957ca7 0.00 +0.102448 sec +.fi + .SH INPUT For interval-mode, turbostat will immediately end the current interval From 19d076903b95896ce55c7cc3679f795731591ac6 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 24 Jul 2024 13:17:30 +0200 Subject: [PATCH 112/120] tools/power turbostat: Include umask=%x in perf counter's config Some counters, like cpu/cache-misses/, expose and require umask=%x parameter alongside event=%x in the sysfs perf counter's event file. This change make sure we parse and use it when opening user added counters. Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 60 ++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f769e8beabd3..ed5f032c6b3f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4017,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys) return read_perf_counter_info_n(path, format); } -static unsigned int read_rapl_config(const char *subsys, const char *event_name) +static unsigned int read_perf_config(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s"; - const char *const format = "event=%x"; + FILE *fconfig = NULL; char path[128]; + char config_str[64]; + unsigned int config; + unsigned int umask; + bool has_config = false; + bool has_umask = false; + unsigned int ret = -1; snprintf(path, sizeof(path), path_format, subsys, event_name); - return read_perf_counter_info_n(path, format); + fconfig = fopen(path, "r"); + if (!fconfig) + return -1; + + if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str) + goto cleanup_and_exit; + + for (char *pconfig_str = &config_str[0]; pconfig_str;) { + if (sscanf(pconfig_str, "event=%x", &config) == 1) { + has_config = true; + goto next; + } + + if (sscanf(pconfig_str, "umask=%x", &umask) == 1) { + has_umask = true; + goto next; + } + + next: + pconfig_str = strchr(pconfig_str, ','); + if (pconfig_str) { + *pconfig_str = '\0'; + ++pconfig_str; + } + } + + if (!has_umask) + umask = 0; + + if (has_config) + ret = (umask << 8) | config; + +cleanup_and_exit: + fclose(fconfig); + return ret; } static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name) @@ -4044,7 +4084,7 @@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na return RAPL_UNIT_INVALID; } -static double read_perf_rapl_scale(const char *subsys, const char *event_name) +static double read_perf_scale(const char *subsys, const char *event_name) { const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale"; const char *const format = "%lf"; @@ -7425,7 +7465,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc if (no_perf) return -1; - const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name); + const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name); if (scale == 0.0) return -1; @@ -7436,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc return -1; const unsigned int rapl_type = read_perf_type(cai->perf_subsys); - const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP); @@ -7598,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP); @@ -7628,7 +7668,7 @@ int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct return -1; const unsigned int type = read_perf_type(cai->perf_subsys); - const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name); + const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name); const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP); @@ -8571,7 +8611,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) continue; } - perf_config = read_rapl_config(pinfo->device, pinfo->event); + perf_config = read_perf_config(pinfo->device, pinfo->event); if (perf_config == (unsigned int)-1) { warnx("%s: perf/%s/%s: failed to read %s", __func__, pinfo->device, pinfo->event, "config"); @@ -8579,7 +8619,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) } /* Scale is not required, some counters just don't have it. */ - perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event); + perf_scale = read_perf_scale(pinfo->device, pinfo->event); if (perf_scale == 0.0) perf_scale = 1.0; From 866d2d36b81d7d0e6d91423b6dd9b1bcfd0510dd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 26 Jul 2024 14:16:28 -0400 Subject: [PATCH 113/120] tools/power turbostat: version 2024.07.26 Release 2024.07.26: Enable turbostat extensions to add both perf and PMT (Intel Platform Monitoring Technology) counters from the cmdline. Demonstrate PMT access with built-in support for Meteor Lake's Die%c6 counter. This commit: Clean up white-space nits introduced since version 2024.05.10 Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 105 +++++++++++++------------- 1 file changed, 52 insertions(+), 53 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ed5f032c6b3f..089220aaa5c9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1057,8 +1057,7 @@ void probe_platform_features(unsigned int family, unsigned int model) return; for (i = 0; turbostat_pdata[i].features; i++) { - if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && - VFM_MODEL(turbostat_pdata[i].vfm) == model) { + if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) { platform = turbostat_pdata[i].features; return; } @@ -1448,28 +1447,28 @@ enum msr_arch_info_index { static struct msr_counter_arch_info msr_counter_arch_infos[] = { [MSR_ARCH_INFO_APERF_INDEX] = { - .perf_subsys = "msr", - .perf_name = "aperf", - .msr = MSR_IA32_APERF, - .msr_mask = 0xFFFFFFFFFFFFFFFF, - .rci_index = MSR_RCI_INDEX_APERF, - }, + .perf_subsys = "msr", + .perf_name = "aperf", + .msr = MSR_IA32_APERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_APERF, + }, [MSR_ARCH_INFO_MPERF_INDEX] = { - .perf_subsys = "msr", - .perf_name = "mperf", - .msr = MSR_IA32_MPERF, - .msr_mask = 0xFFFFFFFFFFFFFFFF, - .rci_index = MSR_RCI_INDEX_MPERF, - }, + .perf_subsys = "msr", + .perf_name = "mperf", + .msr = MSR_IA32_MPERF, + .msr_mask = 0xFFFFFFFFFFFFFFFF, + .rci_index = MSR_RCI_INDEX_MPERF, + }, [MSR_ARCH_INFO_SMI_INDEX] = { - .perf_subsys = "msr", - .perf_name = "smi", - .msr = MSR_SMI_COUNT, - .msr_mask = 0xFFFFFFFF, - .rci_index = MSR_RCI_INDEX_SMI, - }, + .perf_subsys = "msr", + .perf_name = "smi", + .msr = MSR_SMI_COUNT, + .msr_mask = 0xFFFFFFFF, + .rci_index = MSR_RCI_INDEX_SMI, + }, }; /* Can be redefined when compiling, useful for testing. */ @@ -1535,14 +1534,14 @@ struct pmt_counter { unsigned int pmt_counter_get_width(const struct pmt_counter *p) { - return (p->msb - p->lsb)+1; + return (p->msb - p->lsb) + 1; } void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size) { struct pmt_domain_info *new_mem; - new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); + new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains)); if (!new_mem) { fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__); exit(1); @@ -1567,7 +1566,7 @@ void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size) */ if (new_size < 8) new_size = 8; - new_size = MAX(new_size, pcounter->num_domains*2); + new_size = MAX(new_size, pcounter->num_domains * 2); pmt_counter_resize_(pcounter, new_size); } @@ -2282,7 +2281,7 @@ void print_header(char *delim) ppmt = sys.pmt_tp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); @@ -2356,7 +2355,7 @@ void print_header(char *delim) ppmt = sys.pmt_cp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); @@ -2487,7 +2486,7 @@ void print_header(char *delim) ppmt = sys.pmt_pp; while (ppmt) { - switch(ppmt->type) { + switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name); @@ -2969,7 +2968,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); if (DO_BIC(BIC_Diec6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float); if (DO_BIC(BIC_CPU_LPI)) { if (p->cpu_lpi >= 0) @@ -4049,7 +4049,7 @@ static unsigned int read_perf_config(const char *subsys, const char *event_name) goto next; } - next: +next: pconfig_str = strchr(pconfig_str, ','); if (pconfig_str) { *pconfig_str = '\0'; @@ -4463,9 +4463,9 @@ unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb) if (msb == 63) mask = 0xffffffffffffffff; else - mask = ((1<<(msb+1))-1); + mask = ((1 << (msb + 1)) - 1); - mask -= (1<device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); } pinfo = pinfo->next; @@ -8687,7 +8687,7 @@ int parse_telem_info_file(int fd_dir, const char *info_filename, const char *for return 0; } -struct pmt_mmio* pmt_mmio_open(unsigned int target_guid) +struct pmt_mmio *pmt_mmio_open(unsigned int target_guid) { DIR *dirp; struct dirent *entry; @@ -8793,7 +8793,7 @@ loop_cleanup_and_break: return ret; } -struct pmt_mmio* pmt_mmio_find(unsigned int guid) +struct pmt_mmio *pmt_mmio_find(unsigned int guid) { struct pmt_mmio *pmmio = pmt_mmios; @@ -8802,22 +8802,22 @@ struct pmt_mmio* pmt_mmio_find(unsigned int guid) return pmmio; pmmio = pmmio->next; - } + } return NULL; } -void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) +void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset) { char *ret; /* Get base of mmaped PMT file. */ - ret = (char*)pmmio->mmio_base; + ret = (char *)pmmio->mmio_base; /* * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data. * It's not guaranteed that the mmaped memory begins with the telemetry data - * - we might have to apply the offset first. + * - we might have to apply the offset first. */ ret += pmmio->pmt_offset; @@ -8827,7 +8827,7 @@ void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offs return ret; } -struct pmt_mmio* pmt_add_guid(unsigned int guid) +struct pmt_mmio *pmt_add_guid(unsigned int guid) { struct pmt_mmio *ret; @@ -8843,7 +8843,7 @@ enum pmt_open_mode { PMT_OPEN_REQUIRED, /* Open failure is a fatal error. */ }; -struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name) +struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name) { while (pcounter) { if (strcmp(pcounter->name, name) == 0) @@ -8855,9 +8855,9 @@ struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *n return pcounter; } -struct pmt_counter** pmt_get_scope_root(enum counter_scope scope) +struct pmt_counter **pmt_get_scope_root(enum counter_scope scope) { - switch(scope) { + switch (scope) { case SCOPE_CPU: return &sys.pmt_tp; case SCOPE_CORE: @@ -8873,7 +8873,7 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, { /* Make sure the new domain fits. */ if (domain_id >= pcounter->num_domains) - pmt_counter_resize(pcounter, domain_id+1); + pmt_counter_resize(pcounter, domain_id + 1); assert(pcounter->domains); assert(domain_id < pcounter->num_domains); @@ -8882,12 +8882,12 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, } int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, - unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, - enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) + unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope, + enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode) { struct pmt_mmio *mmio; struct pmt_counter *pcounter; - struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope); + struct pmt_counter **const pmt_root = pmt_get_scope_root(scope); bool new_counter = false; int conflict = 0; @@ -8927,7 +8927,7 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type, } if (new_counter) { - strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1); + strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1); pcounter->type = type; pcounter->scope = scope; pcounter->lsb = lsb; @@ -9071,7 +9071,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 2024.05.10 - Len Brown \n"); + fprintf(outf, "turbostat version 2024.07.26 - Len Brown \n"); } #define COMMAND_LINE_SIZE 2048 @@ -9299,7 +9299,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char // FIXME: we might not have debug here yet if (debug) fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n", - __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); + __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope); return 0; } @@ -9450,10 +9450,10 @@ void parse_add_command_pmt(char *add_command) bool has_offset = false; bool has_lsb = false; bool has_msb = false; - bool has_format = true; /* Format has a default value. */ + bool has_format = true; /* Format has a default value. */ bool has_guid = false; bool has_scope = false; - bool has_type = true; /* Type has a default value. */ + bool has_type = true; /* Type has a default value. */ /* Consume the "pmt," prefix. */ add_command = strchr(add_command, ','); @@ -9490,7 +9490,7 @@ void parse_add_command_pmt(char *add_command) if (!has_scope) { printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n", - __func__); + __func__); exit(1); } @@ -9536,8 +9536,7 @@ next: } if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) { - printf("%s: name has to be at most %d characters long\n", - __func__, PMT_COUNTER_NAME_SIZE_BYTES); + printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES); exit(1); } From e1c5ae59c0f22f7fe5c07fb5513a29e4aad868c9 Mon Sep 17 00:00:00 2001 From: "Seth Forshee (DigitalOcean)" Date: Wed, 24 Jul 2024 09:53:59 -0500 Subject: [PATCH 114/120] fs: don't allow non-init s_user_ns for filesystems without FS_USERNS_MOUNT Christian noticed that it is possible for a privileged user to mount most filesystems with a non-initial user namespace in sb->s_user_ns. When fsopen() is called in a non-init namespace the caller's namespace is recorded in fs_context->user_ns. If the returned file descriptor is then passed to a process priviliged in init_user_ns, that process can call fsconfig(fd_fs, FSCONFIG_CMD_CREATE), creating a new superblock with sb->s_user_ns set to the namespace of the process which called fsopen(). This is problematic. We cannot assume that any filesystem which does not set FS_USERNS_MOUNT has been written with a non-initial s_user_ns in mind, increasing the risk for bugs and security issues. Prevent this by returning EPERM from sget_fc() when FS_USERNS_MOUNT is not set for the filesystem and a non-initial user namespace will be used. sget() does not need to be updated as it always uses the user namespace of the current context, or the initial user namespace if SB_SUBMOUNT is set. Fixes: cb50b348c71f ("convenience helpers: vfs_get_super() and sget_fc()") Reported-by: Christian Brauner Signed-off-by: Seth Forshee (DigitalOcean) Link: https://lore.kernel.org/r/20240724-s_user_ns-fix-v1-1-895d07c94701@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- fs/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/super.c b/fs/super.c index 095ba793e10c..38d72a3cf6fc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -736,6 +736,17 @@ struct super_block *sget_fc(struct fs_context *fc, struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; + /* + * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is + * not set, as the filesystem is likely unprepared to handle it. + * This can happen when fsconfig() is called from init_user_ns with + * an fs_fd opened in another user namespace. + */ + if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); + return ERR_PTR(-EPERM); + } + retry: spin_lock(&sb_lock); if (test) { From ef9ca17ca458ac7253ae71b552e601e49311fc48 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 25 Jul 2024 14:51:30 +0800 Subject: [PATCH 115/120] hostfs: fix the host directory parse when mounting. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hostfs not keep the host directory when mounting. When the host directory is none (default), fc->source is used as the host root directory, and this is wrong. Here we use `parse_monolithic` to handle the old mount path for parsing the root directory. For new mount path, The `parse_param` is used for the host directory parse. Reported-and-tested-by: Maciej Żenczykowski Fixes: cd140ce9f611 ("hostfs: convert hostfs to use the new mount API") Link: https://lore.kernel.org/all/CANP3RGceNzwdb7w=vPf5=7BCid5HVQDmz1K5kC9JG42+HVAh_g@mail.gmail.com/ Cc: Christian Brauner Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240725065130.1821964-1-lihongbo22@huawei.com [brauner: minor fixes] Signed-off-by: Christian Brauner --- fs/hostfs/hostfs_kern.c | 65 ++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 2b532670a561..e866b08e448f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "hostfs.h" #include @@ -927,7 +928,6 @@ static const struct inode_operations hostfs_link_iops = { static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hostfs_fs_info *fsi = sb->s_fs_info; - const char *host_root = fc->source; struct inode *root_inode; int err; @@ -941,15 +941,6 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - /* NULL is printed as '(null)' by printf(): avoid that. */ - if (fc->source == NULL) - host_root = ""; - - fsi->host_root_path = - kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); - if (fsi->host_root_path == NULL) - return -ENOMEM; - root_inode = hostfs_iget(sb, fsi->host_root_path); if (IS_ERR(root_inode)) return PTR_ERR(root_inode); @@ -975,6 +966,58 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc) return 0; } +enum hostfs_parma { + Opt_hostfs, +}; + +static const struct fs_parameter_spec hostfs_param_specs[] = { + fsparam_string_empty("hostfs", Opt_hostfs), + {} +}; + +static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + struct fs_parse_result result; + char *host_root; + int opt; + + opt = fs_parse(fc, hostfs_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_hostfs: + host_root = param->string; + if (!*host_root) + host_root = ""; + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + break; + } + + return 0; +} + +static int hostfs_parse_monolithic(struct fs_context *fc, void *data) +{ + struct hostfs_fs_info *fsi = fc->s_fs_info; + char *host_root = (char *)data; + + /* NULL is printed as '(null)' by printf(): avoid that. */ + if (host_root == NULL) + host_root = ""; + + fsi->host_root_path = + kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root); + if (fsi->host_root_path == NULL) + return -ENOMEM; + + return 0; +} + static int hostfs_fc_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, hostfs_fill_super); @@ -992,6 +1035,8 @@ static void hostfs_fc_free(struct fs_context *fc) } static const struct fs_context_operations hostfs_context_ops = { + .parse_monolithic = hostfs_parse_monolithic, + .parse_param = hostfs_parse_param, .get_tree = hostfs_fc_get_tree, .free = hostfs_fc_free, }; From d01c14074be79e5f5270498f90530a12583fbf7a Mon Sep 17 00:00:00 2001 From: Jose Ignacio Tornos Martinez Date: Fri, 26 Jul 2024 11:00:26 +0200 Subject: [PATCH 116/120] kbuild: rpm-pkg: ghost modules.weakdep file In the same way as for other similar files, mark as ghost the new file generated by depmod for configured weak dependencies for modules, modules.weakdep, so that although it is not included in the package, claim the ownership on it. Signed-off-by: Jose Ignacio Tornos Martinez Signed-off-by: Masahiro Yamada --- scripts/package/kernel.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec index 74355ff0e106..ac3e5ac01d8a 100644 --- a/scripts/package/kernel.spec +++ b/scripts/package/kernel.spec @@ -74,7 +74,7 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA echo "/lib/modules/%{KERNELRELEASE}" for x in alias alias.bin builtin.alias.bin builtin.bin dep dep.bin \ - devname softdep symbols symbols.bin; do + devname softdep symbols symbols.bin weakdep; do echo "%ghost /lib/modules/%{KERNELRELEASE}/modules.${x}" done From 92a286e90203ce3e6c3a6d945fa36da419c3671f Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sat, 13 Jul 2024 09:35:19 +0200 Subject: [PATCH 117/120] ubi: Fix ubi_init() ubiblock_exit() section mismatch Since ubiblock_exit() is now called from an init function, the __exit section no longer makes sense. Cc: Ben Hutchings Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407131403.wZJpd8n2-lkp@intel.com/ Signed-off-by: Richard Weinberger Reviewed-by: Zhihao Cheng --- drivers/mtd/ubi/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index bf7308e8ec2f..60d0155be869 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -670,7 +670,7 @@ err_unreg: return ret; } -void __exit ubiblock_exit(void) +void ubiblock_exit(void) { ubi_unregister_volume_notifier(&ubiblock_notifier); ubiblock_remove_all(); From 3415b10a03945b0da4a635e146750dfe5ce0f448 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 26 Jul 2024 11:05:00 -0700 Subject: [PATCH 118/120] kbuild: Fix '-S -c' in x86 stack protector scripts After a recent change in clang to stop consuming all instances of '-S' and '-c' [1], the stack protector scripts break due to the kernel's use of -Werror=unused-command-line-argument to catch cases where flags are not being properly consumed by the compiler driver: $ echo | clang -o - -x c - -S -c -Werror=unused-command-line-argument clang: error: argument unused during compilation: '-c' [-Werror,-Wunused-command-line-argument] This results in CONFIG_STACKPROTECTOR getting disabled because CONFIG_CC_HAS_SANE_STACKPROTECTOR is no longer set. '-c' and '-S' both instruct the compiler to stop at different stages of the pipeline ('-S' after compiling, '-c' after assembling), so having them present together in the same command makes little sense. In this case, the test wants to stop before assembling because it is looking at the textual assembly output of the compiler for either '%fs' or '%gs', so remove '-c' from the list of arguments to resolve the error. All versions of GCC continue to work after this change, along with versions of clang that do or do not contain the change mentioned above. Cc: stable@vger.kernel.org Fixes: 4f7fd4d7a791 ("[PATCH] Add the -fstack-protector option to the CFLAGS") Fixes: 60a5317ff0f4 ("x86: implement x86_32 stack protector") Link: https://github.com/llvm/llvm-project/commit/6461e537815f7fa68cef06842505353cf5600e9c [1] Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- scripts/gcc-x86_32-has-stack-protector.sh | 2 +- scripts/gcc-x86_64-has-stack-protector.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh index 825c75c5b715..9459ca4f0f11 100755 --- a/scripts/gcc-x86_32-has-stack-protector.sh +++ b/scripts/gcc-x86_32-has-stack-protector.sh @@ -5,4 +5,4 @@ # -mstack-protector-guard-reg, added by # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs" diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh index 75e4e22b986a..f680bb01aeeb 100755 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -1,4 +1,4 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs" From 4477b39c32fdc03363affef4b11d48391e6dc9ff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 13:03:48 -0700 Subject: [PATCH 119/120] minmax: add a few more MIN_T/MAX_T users Commit 3a7e02c040b1 ("minmax: avoid overly complicated constant expressions in VM code") added the simpler MIN_T/MAX_T macros in order to avoid some excessive expansion from the rather complicated regular min/max macros. The complexity of those macros stems from two issues: (a) trying to use them in situations that require a C constant expression (in static initializers and for array sizes) (b) the type sanity checking and MIN_T/MAX_T avoids both of these issues. Now, in the whole (long) discussion about all this, it was pointed out that the whole type sanity checking is entirely unnecessary for min_t/max_t which get a fixed type that the comparison is done in. But that still leaves min_t/max_t unnecessarily complicated due to worries about the C constant expression case. However, it turns out that there really aren't very many cases that use min_t/max_t for this, and we can just force-convert those. This does exactly that. Which in turn will then allow for much simpler implementations of min_t()/max_t(). All the usual "macros in all upper case will evaluate the arguments multiple times" rules apply. We should do all the same things for the regular min/max() vs MIN/MAX() cases, but that has the added complexity of various drivers defining their own local versions of MIN/MAX, so that needs another level of fixes first. Link: https://lore.kernel.org/all/b47fad1d0cf8449886ad148f8c013dae@AcuMS.aculab.com/ Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 2 +- drivers/edac/sb_edac.c | 4 ++-- drivers/gpu/drm/drm_color_mgmt.c | 2 +- drivers/md/dm-integrity.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- net/ipv4/proc.c | 2 +- net/ipv6/proc.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 93e54ba91fbf..f5931499c2d6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -110,7 +110,7 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ - max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) + MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index cbc92d3683e6..e5c05a876947 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -109,8 +109,8 @@ static const u32 knl_interleave_list[] = { 0x104, 0x10c, 0x114, 0x11c, /* 20-23 */ }; #define MAX_INTERLEAVE \ - (max_t(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ - max_t(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ + (MAX_T(unsigned int, ARRAY_SIZE(sbridge_interleave_list), \ + MAX_T(unsigned int, ARRAY_SIZE(ibridge_interleave_list), \ ARRAY_SIZE(knl_interleave_list)))) struct interleave_pkg { diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c index d021497841b8..3969dc548cff 100644 --- a/drivers/gpu/drm/drm_color_mgmt.c +++ b/drivers/gpu/drm/drm_color_mgmt.c @@ -532,7 +532,7 @@ int drm_plane_create_color_properties(struct drm_plane *plane, { struct drm_device *dev = plane->dev; struct drm_property *prop; - struct drm_prop_enum_list enum_list[max_t(int, DRM_COLOR_ENCODING_MAX, + struct drm_prop_enum_list enum_list[MAX_T(int, DRM_COLOR_ENCODING_MAX, DRM_COLOR_RANGE_MAX)]; int i, len; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 48ac628f471b..51e6964c1305 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1776,7 +1776,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; sector_t sector; unsigned int sectors_to_process; @@ -2064,7 +2064,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { @@ -2837,7 +2837,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start unlikely(from_replay) && #endif ic->internal_hash) { - char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 12689774d755..f3a1b179aaea 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2912,7 +2912,7 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) u32 channels_to_check = tx_channel_count > rx_channel_count ? tx_channel_count : rx_channel_count; u32 chan; - int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; + int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6c4664c681ca..40053a02bae1 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -44,7 +44,7 @@ #include #include -#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) +#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX) /* * Report socket allocation statistics [mea@utu.fi] diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6d1d9221649d..752327b10dde 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -27,7 +27,7 @@ #include #define MAX4(a, b, c, d) \ - max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) + MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d)) #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ IPSTATS_MIB_MAX, ICMP_MIB_MAX) From 017fa3e89187848fd056af757769c9e66ac3e93d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Jul 2024 13:50:01 -0700 Subject: [PATCH 120/120] minmax: simplify and clarify min_t()/max_t() implementation This simplifies the min_t() and max_t() macros by no longer making them work in the context of a C constant expression. That means that you can no longer use them for static initializers or for array sizes in type definitions, but there were only a couple of such uses, and all of them were converted (famous last words) to use MIN_T/MAX_T instead. Cc: David Laight Cc: Lorenzo Stoakes Signed-off-by: Linus Torvalds --- include/linux/minmax.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index a7ef65f78933..9c2848abc804 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -45,17 +45,20 @@ #define __cmp(op, x, y) ((x) __cmp_op_##op (y) ? (x) : (y)) -#define __cmp_once(op, x, y, unique_x, unique_y) ({ \ - typeof(x) unique_x = (x); \ - typeof(y) unique_y = (y); \ +#define __cmp_once_unique(op, type, x, y, ux, uy) \ + ({ type ux = (x); type uy = (y); __cmp(op, ux, uy); }) + +#define __cmp_once(op, type, x, y) \ + __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) + +#define __careful_cmp_once(op, x, y) ({ \ static_assert(__types_ok(x, y), \ #op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \ - __cmp(op, unique_x, unique_y); }) + __cmp_once(op, __auto_type, x, y); }) #define __careful_cmp(op, x, y) \ __builtin_choose_expr(__is_constexpr((x) - (y)), \ - __cmp(op, x, y), \ - __cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y))) + __cmp(op, x, y), __careful_cmp_once(op, x, y)) #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) @@ -158,7 +161,7 @@ * @x: first value * @y: second value */ -#define min_t(type, x, y) __careful_cmp(min, (type)(x), (type)(y)) +#define min_t(type, x, y) __cmp_once(min, type, x, y) /** * max_t - return maximum of two values, using the specified type @@ -166,7 +169,7 @@ * @x: first value * @y: second value */ -#define max_t(type, x, y) __careful_cmp(max, (type)(x), (type)(y)) +#define max_t(type, x, y) __cmp_once(max, type, x, y) /* * Do not check the array parameter using __must_be_array().