From 0005b2dc43f96b93fc5b0850d7ca3f7aeac9129c Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Wed, 3 Jul 2024 16:57:17 +0200 Subject: [PATCH 01/33] dsa: lan9303: Fix mapping between DSA port number and PHY address The 'phy' parameter supplied to lan9303_phy_read/_write was sometimes a DSA port number and sometimes a PHY address. This isn't a problem as long as they are equal. But if the external phy_addr_sel_strap pin is wired to 'high', the PHY addresses change from 0-1-2 to 1-2-3 (CPU, slave0, slave1). In this case, lan9303_phy_read/_write must translate between DSA port numbers and the corresponding PHY address. Fixes: a1292595e006 ("net: dsa: add new DSA switch driver for the SMSC-LAN9303") Signed-off-by: Christian Eggers Reviewed-by: Michal Kubiak Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20240703145718.19951-1-ceggers@arri.de Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lan9303-core.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 02f07b870f10..268949939636 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -1047,31 +1047,31 @@ static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset) return ARRAY_SIZE(lan9303_mib); } -static int lan9303_phy_read(struct dsa_switch *ds, int phy, int regnum) +static int lan9303_phy_read(struct dsa_switch *ds, int port, int regnum) { struct lan9303 *chip = ds->priv; int phy_base = chip->phy_addr_base; - if (phy == phy_base) + if (port == 0) return lan9303_virt_phy_reg_read(chip, regnum); - if (phy > phy_base + 2) + if (port > 2) return -ENODEV; - return chip->ops->phy_read(chip, phy, regnum); + return chip->ops->phy_read(chip, phy_base + port, regnum); } -static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum, +static int lan9303_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val) { struct lan9303 *chip = ds->priv; int phy_base = chip->phy_addr_base; - if (phy == phy_base) + if (port == 0) return lan9303_virt_phy_reg_write(chip, regnum, val); - if (phy > phy_base + 2) + if (port > 2) return -ENODEV; - return chip->ops->phy_write(chip, phy, regnum, val); + return chip->ops->phy_write(chip, phy_base + port, regnum, val); } static int lan9303_port_enable(struct dsa_switch *ds, int port, @@ -1099,7 +1099,7 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port) vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port); lan9303_disable_processing_port(chip, port); - lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN); + lan9303_phy_write(ds, port, MII_BMCR, BMCR_PDOWN); } static int lan9303_port_bridge_join(struct dsa_switch *ds, int port, @@ -1374,8 +1374,6 @@ static const struct dsa_switch_ops lan9303_switch_ops = { static int lan9303_register_switch(struct lan9303 *chip) { - int base; - chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL); if (!chip->ds) return -ENOMEM; @@ -1385,8 +1383,7 @@ static int lan9303_register_switch(struct lan9303 *chip) chip->ds->priv = chip; chip->ds->ops = &lan9303_switch_ops; chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops; - base = chip->phy_addr_base; - chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base); + chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1, 0); return dsa_register_switch(chip->ds); } From 0c754d9d86ffdf2f86b4272b25d759843fb62fd8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jul 2024 10:19:44 -0500 Subject: [PATCH 02/33] net: bcmasp: Fix error code in probe() Return an error code if bcmasp_interface_create() fails. Don't return success. Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller") Signed-off-by: Dan Carpenter Reviewed-by: Michal Kubiak Reviewed-by: Justin Chen Link: https://patch.msgid.link/ZoWKBkHH9D1fqV4r@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index a806dadc4196..20c6529ec135 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -1380,6 +1380,7 @@ static int bcmasp_probe(struct platform_device *pdev) dev_err(dev, "Cannot create eth interface %d\n", i); bcmasp_remove_intfs(priv); of_node_put(intf_node); + ret = -ENOMEM; goto of_put_exit; } list_add_tail(&intf->list, &priv->intfs); From 2cb489eb8dfc291060516df313ff31f4f9f3d794 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 4 Jul 2024 17:45:14 +0200 Subject: [PATCH 03/33] wireguard: selftests: use acpi=off instead of -no-acpi for recent QEMU QEMU 9.0 removed -no-acpi, in favor of machine properties, so update the Makefile to use the correct QEMU invocation. Cc: stable@vger.kernel.org Fixes: b83fdcd9fb8a ("wireguard: selftests: use microvm on x86") Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20240704154517.1572127-2-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index e95bd56b332f..35856b11c143 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -109,9 +109,9 @@ KERNEL_ARCH := x86_64 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(HOST_ARCH),$(ARCH)) -QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off else -QEMU_MACHINE := -cpu max -machine microvm -no-acpi +QEMU_MACHINE := -cpu max -machine microvm,acpi=off endif else ifeq ($(ARCH),i686) CHOST := i686-linux-musl @@ -120,9 +120,9 @@ KERNEL_ARCH := x86 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage QEMU_VPORT_RESULT := virtio-serial-device ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) -QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi +QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off else -QEMU_MACHINE := -cpu coreduo -machine microvm -no-acpi +QEMU_MACHINE := -cpu coreduo -machine microvm,acpi=off endif else ifeq ($(ARCH),mips64) CHOST := mips64-linux-musl From 948f991c62a4018fb81d85804eeab3029c6209f8 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 4 Jul 2024 17:45:15 +0200 Subject: [PATCH 04/33] wireguard: allowedips: avoid unaligned 64-bit memory accesses On the parisc platform, the kernel issues kernel warnings because swap_endian() tries to load a 128-bit IPv6 address from an unaligned memory location: Kernel: unaligned access to 0x55f4688c in wg_allowedips_insert_v6+0x2c/0x80 [wireguard] (iir 0xf3010df) Kernel: unaligned access to 0x55f46884 in wg_allowedips_insert_v6+0x38/0x80 [wireguard] (iir 0xf2010dc) Avoid such unaligned memory accesses by instead using the get_unaligned_be64() helper macro. Signed-off-by: Helge Deller [Jason: replace src[8] in original patch with src+8] Cc: stable@vger.kernel.org Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20240704154517.1572127-3-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/allowedips.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index 0ba714ca5185..4b8528206cc8 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -15,8 +15,8 @@ static void swap_endian(u8 *dst, const u8 *src, u8 bits) if (bits == 32) { *(u32 *)dst = be32_to_cpu(*(const __be32 *)src); } else if (bits == 128) { - ((u64 *)dst)[0] = be64_to_cpu(((const __be64 *)src)[0]); - ((u64 *)dst)[1] = be64_to_cpu(((const __be64 *)src)[1]); + ((u64 *)dst)[0] = get_unaligned_be64(src); + ((u64 *)dst)[1] = get_unaligned_be64(src + 8); } } From 2fe3d6d2053c57f2eae5e85ca1656d185ebbe4e8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 4 Jul 2024 17:45:16 +0200 Subject: [PATCH 05/33] wireguard: queueing: annotate intentional data race in cpu round robin KCSAN reports a race in the CPU round robin function, which, as the comment points out, is intentional: BUG: KCSAN: data-race in wg_packet_send_staged_packets / wg_packet_send_staged_packets read to 0xffff88811254eb28 of 4 bytes by task 3160 on cpu 1: wg_cpumask_next_online drivers/net/wireguard/queueing.h:127 [inline] wg_queue_enqueue_per_device_and_peer drivers/net/wireguard/queueing.h:173 [inline] wg_packet_create_data drivers/net/wireguard/send.c:320 [inline] wg_packet_send_staged_packets+0x60e/0xac0 drivers/net/wireguard/send.c:388 wg_packet_send_keepalive+0xe2/0x100 drivers/net/wireguard/send.c:239 wg_receive_handshake_packet drivers/net/wireguard/receive.c:186 [inline] wg_packet_handshake_receive_worker+0x449/0x5f0 drivers/net/wireguard/receive.c:213 process_one_work kernel/workqueue.c:3248 [inline] process_scheduled_works+0x483/0x9a0 kernel/workqueue.c:3329 worker_thread+0x526/0x720 kernel/workqueue.c:3409 kthread+0x1d1/0x210 kernel/kthread.c:389 ret_from_fork+0x4b/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 write to 0xffff88811254eb28 of 4 bytes by task 3158 on cpu 0: wg_cpumask_next_online drivers/net/wireguard/queueing.h:130 [inline] wg_queue_enqueue_per_device_and_peer drivers/net/wireguard/queueing.h:173 [inline] wg_packet_create_data drivers/net/wireguard/send.c:320 [inline] wg_packet_send_staged_packets+0x6e5/0xac0 drivers/net/wireguard/send.c:388 wg_packet_send_keepalive+0xe2/0x100 drivers/net/wireguard/send.c:239 wg_receive_handshake_packet drivers/net/wireguard/receive.c:186 [inline] wg_packet_handshake_receive_worker+0x449/0x5f0 drivers/net/wireguard/receive.c:213 process_one_work kernel/workqueue.c:3248 [inline] process_scheduled_works+0x483/0x9a0 kernel/workqueue.c:3329 worker_thread+0x526/0x720 kernel/workqueue.c:3409 kthread+0x1d1/0x210 kernel/kthread.c:389 ret_from_fork+0x4b/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 value changed: 0xffffffff -> 0x00000000 Mark this race as intentional by using READ/WRITE_ONCE(). Cc: stable@vger.kernel.org Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20240704154517.1572127-4-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/queueing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index 1ea4f874e367..7eb76724b3ed 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -124,10 +124,10 @@ static inline int wg_cpumask_choose_online(int *stored_cpu, unsigned int id) */ static inline int wg_cpumask_next_online(int *last_cpu) { - int cpu = cpumask_next(*last_cpu, cpu_online_mask); + int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); - *last_cpu = cpu; + WRITE_ONCE(*last_cpu, cpu); return cpu; } From 381a7d453fa2ac5f854a154d3c9b1bbb90c4f94f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 4 Jul 2024 17:45:17 +0200 Subject: [PATCH 06/33] wireguard: send: annotate intentional data race in checking empty queue KCSAN reports a race in wg_packet_send_keepalive, which is intentional: BUG: KCSAN: data-race in wg_packet_send_keepalive / wg_packet_send_staged_packets write to 0xffff88814cd91280 of 8 bytes by task 3194 on cpu 0: __skb_queue_head_init include/linux/skbuff.h:2162 [inline] skb_queue_splice_init include/linux/skbuff.h:2248 [inline] wg_packet_send_staged_packets+0xe5/0xad0 drivers/net/wireguard/send.c:351 wg_xmit+0x5b8/0x660 drivers/net/wireguard/device.c:218 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3564 __dev_queue_xmit+0xeff/0x1d80 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1592 neigh_output include/net/neighbour.h:542 [inline] ip6_finish_output2+0xa66/0xce0 net/ipv6/ip6_output.c:137 ip6_finish_output+0x1a5/0x490 net/ipv6/ip6_output.c:222 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip6_output+0xeb/0x220 net/ipv6/ip6_output.c:243 dst_output include/net/dst.h:451 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] ndisc_send_skb+0x4a2/0x670 net/ipv6/ndisc.c:509 ndisc_send_rs+0x3ab/0x3e0 net/ipv6/ndisc.c:719 addrconf_dad_completed+0x640/0x8e0 net/ipv6/addrconf.c:4295 addrconf_dad_work+0x891/0xbc0 process_one_work kernel/workqueue.c:2633 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2706 worker_thread+0x525/0x730 kernel/workqueue.c:2787 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 read to 0xffff88814cd91280 of 8 bytes by task 3202 on cpu 1: skb_queue_empty include/linux/skbuff.h:1798 [inline] wg_packet_send_keepalive+0x20/0x100 drivers/net/wireguard/send.c:225 wg_receive_handshake_packet drivers/net/wireguard/receive.c:186 [inline] wg_packet_handshake_receive_worker+0x445/0x5e0 drivers/net/wireguard/receive.c:213 process_one_work kernel/workqueue.c:2633 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2706 worker_thread+0x525/0x730 kernel/workqueue.c:2787 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 value changed: 0xffff888148fef200 -> 0xffff88814cd91280 Mark this race as intentional by using the skb_queue_empty_lockless() function rather than skb_queue_empty(), which uses READ_ONCE() internally to annotate the race. Cc: stable@vger.kernel.org Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20240704154517.1572127-5-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 0d48e0f4a1ba..26e09c30d596 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -222,7 +222,7 @@ void wg_packet_send_keepalive(struct wg_peer *peer) { struct sk_buff *skb; - if (skb_queue_empty(&peer->staged_packet_queue)) { + if (skb_queue_empty_lockless(&peer->staged_packet_queue)) { skb = alloc_skb(DATA_PACKET_HEAD_ROOM + MESSAGE_MINIMUM_LENGTH, GFP_ATOMIC); if (unlikely(!skb)) From 0ec986ed7bab6801faed1440e8839dcc710331ff Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 3 Jul 2024 13:12:46 -0400 Subject: [PATCH 07/33] tcp: fix incorrect undo caused by DSACK of TLP retransmit Loss recovery undo_retrans bookkeeping had a long-standing bug where a DSACK from a spurious TLP retransmit packet could cause an erroneous undo of a fast recovery or RTO recovery that repaired a single really-lost packet (in a sequence range outside that of the TLP retransmit). Basically, because the loss recovery state machine didn't account for the fact that it sent a TLP retransmit, the DSACK for the TLP retransmit could erroneously be implicitly be interpreted as corresponding to the normal fast recovery or RTO recovery retransmit that plugged a real hole, thus resulting in an improper undo. For example, consider the following buggy scenario where there is a real packet loss but the congestion control response is improperly undone because of this bug: + send packets P1, P2, P3, P4 + P1 is really lost + send TLP retransmit of P4 + receive SACK for original P2, P3, P4 + enter fast recovery, fast-retransmit P1, increment undo_retrans to 1 + receive DSACK for TLP P4, decrement undo_retrans to 0, undo (bug!) + receive cumulative ACK for P1-P4 (fast retransmit plugged real hole) The fix: when we initialize undo machinery in tcp_init_undo(), if there is a TLP retransmit in flight, then increment tp->undo_retrans so that we make sure that we receive a DSACK corresponding to the TLP retransmit, as well as DSACKs for all later normal retransmits, before triggering a loss recovery undo. Note that we also have to move the line that clears tp->tlp_high_seq for RTO recovery, so that upon RTO we remember the tp->tlp_high_seq value until tcp_init_undo() and clear it only afterward. Also note that the bug dates back to the original 2013 TLP implementation, commit 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)"). However, this patch will only compile and work correctly with kernels that have tp->tlp_retrans, which was added only in v5.8 in 2020 in commit 76be93fc0702 ("tcp: allow at most one TLP probe per flight"). So we associate this fix with that later commit. Fixes: 76be93fc0702 ("tcp: allow at most one TLP probe per flight") Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Cc: Yuchung Cheng Cc: Kevin Yang Link: https://patch.msgid.link/20240703171246.1739561-1-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 11 ++++++++++- net/ipv4/tcp_timer.c | 2 -- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 77294fd5fd3e..38da23f991d6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2129,8 +2129,16 @@ void tcp_clear_retrans(struct tcp_sock *tp) static inline void tcp_init_undo(struct tcp_sock *tp) { tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ - tp->undo_retrans = tp->retrans_out ? : -1; + /* First, account for regular retransmits in flight: */ + tp->undo_retrans = tp->retrans_out; + /* Next, account for TLP retransmits in flight: */ + if (tp->tlp_high_seq && tp->tlp_retrans) + tp->undo_retrans++; + /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ + if (!tp->undo_retrans) + tp->undo_retrans = -1; } static bool tcp_is_rack(const struct sock *sk) @@ -2209,6 +2217,7 @@ void tcp_enter_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; + tp->tlp_high_seq = 0; tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5bfd76a31af6..db9d826560e5 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -536,8 +536,6 @@ void tcp_retransmit_timer(struct sock *sk) if (WARN_ON_ONCE(!skb)) return; - tp->tlp_high_seq = 0; - if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits From 83c36e7cfd74e41a5c145640dba581b38f12aa15 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Fri, 5 Jul 2024 16:19:35 +1200 Subject: [PATCH 08/33] docs: networking: devlink: capitalise length value Correct the example to match the help text from the devlink utility. Signed-off-by: Chris Packham Reviewed-by: Przemek Kitszel Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-region.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 9232cd7da301..5d0b68f752c0 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -49,7 +49,7 @@ example usage $ devlink region show [ DEV/REGION ] $ devlink region del DEV/REGION snapshot SNAPSHOT_ID $ devlink region dump DEV/REGION [ snapshot SNAPSHOT_ID ] - $ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length length + $ devlink region read DEV/REGION [ snapshot SNAPSHOT_ID ] address ADDRESS length LENGTH # Show all of the exposed regions with region sizes: $ devlink region show From 1cb6f0bae50441f4b4b32a28315853b279c7404e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Jul 2024 15:31:29 +0200 Subject: [PATCH 09/33] bpf: Fix too early release of tcx_entry Pedro Pinto and later independently also Hyunwoo Kim and Wongi Lee reported an issue that the tcx_entry can be released too early leading to a use after free (UAF) when an active old-style ingress or clsact qdisc with a shared tc block is later replaced by another ingress or clsact instance. Essentially, the sequence to trigger the UAF (one example) can be as follows: 1. A network namespace is created 2. An ingress qdisc is created. This allocates a tcx_entry, and &tcx_entry->miniq is stored in the qdisc's miniqp->p_miniq. At the same time, a tcf block with index 1 is created. 3. chain0 is attached to the tcf block. chain0 must be connected to the block linked to the ingress qdisc to later reach the function tcf_chain0_head_change_cb_del() which triggers the UAF. 4. Create and graft a clsact qdisc. This causes the ingress qdisc created in step 1 to be removed, thus freeing the previously linked tcx_entry: rtnetlink_rcv_msg() => tc_modify_qdisc() => qdisc_create() => clsact_init() [a] => qdisc_graft() => qdisc_destroy() => __qdisc_destroy() => ingress_destroy() [b] => tcx_entry_free() => kfree_rcu() // tcx_entry freed 5. Finally, the network namespace is closed. This registers the cleanup_net worker, and during the process of releasing the remaining clsact qdisc, it accesses the tcx_entry that was already freed in step 4, causing the UAF to occur: cleanup_net() => ops_exit_list() => default_device_exit_batch() => unregister_netdevice_many() => unregister_netdevice_many_notify() => dev_shutdown() => qdisc_put() => clsact_destroy() [c] => tcf_block_put_ext() => tcf_chain0_head_change_cb_del() => tcf_chain_head_change_item() => clsact_chain_head_change() => mini_qdisc_pair_swap() // UAF There are also other variants, the gist is to add an ingress (or clsact) qdisc with a specific shared block, then to replace that qdisc, waiting for the tcx_entry kfree_rcu() to be executed and subsequently accessing the current active qdisc's miniq one way or another. The correct fix is to turn the miniq_active boolean into a counter. What can be observed, at step 2 above, the counter transitions from 0->1, at step [a] from 1->2 (in order for the miniq object to remain active during the replacement), then in [b] from 2->1 and finally [c] 1->0 with the eventual release. The reference counter in general ranges from [0,2] and it does not need to be atomic since all access to the counter is protected by the rtnl mutex. With this in place, there is no longer a UAF happening and the tcx_entry is freed at the correct time. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Reported-by: Pedro Pinto Co-developed-by: Pedro Pinto Signed-off-by: Pedro Pinto Signed-off-by: Daniel Borkmann Cc: Hyunwoo Kim Cc: Wongi Lee Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20240708133130.11609-1-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- include/net/tcx.h | 13 +++++++++---- net/sched/sch_ingress.c | 12 ++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/net/tcx.h b/include/net/tcx.h index 72a3e75e539f..5ce0ce9e0c02 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -13,7 +13,7 @@ struct mini_Qdisc; struct tcx_entry { struct mini_Qdisc __rcu *miniq; struct bpf_mprog_bundle bundle; - bool miniq_active; + u32 miniq_active; struct rcu_head rcu; }; @@ -125,11 +125,16 @@ static inline void tcx_skeys_dec(bool ingress) tcx_dec(); } -static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, - const bool active) +static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) { ASSERT_RTNL(); - tcx_entry(entry)->miniq_active = active; + tcx_entry(entry)->miniq_active++; +} + +static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) +{ + ASSERT_RTNL(); + tcx_entry(entry)->miniq_active--; } static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry) diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index c2ef9dcf91d2..cc6051d4f2ef 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -91,7 +91,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -121,7 +121,7 @@ static void ingress_destroy(struct Qdisc *sch) tcf_block_put_ext(q->block, sch, &q->block_info); if (entry) { - tcx_miniq_set_active(entry, false); + tcx_miniq_dec(entry); if (!tcx_entry_is_active(entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(entry); @@ -257,7 +257,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, true, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, true); @@ -276,7 +276,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, entry = tcx_entry_fetch_or_create(dev, false, &created); if (!entry) return -ENOMEM; - tcx_miniq_set_active(entry, true); + tcx_miniq_inc(entry); mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); if (created) tcx_entry_update(dev, entry, false); @@ -302,7 +302,7 @@ static void clsact_destroy(struct Qdisc *sch) tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); if (ingress_entry) { - tcx_miniq_set_active(ingress_entry, false); + tcx_miniq_dec(ingress_entry); if (!tcx_entry_is_active(ingress_entry)) { tcx_entry_update(dev, NULL, true); tcx_entry_free(ingress_entry); @@ -310,7 +310,7 @@ static void clsact_destroy(struct Qdisc *sch) } if (egress_entry) { - tcx_miniq_set_active(egress_entry, false); + tcx_miniq_dec(egress_entry); if (!tcx_entry_is_active(egress_entry)) { tcx_entry_update(dev, NULL, false); tcx_entry_free(egress_entry); From 5f1d18de79180deac2822c93e431bbe547f7d3ce Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Jul 2024 15:31:30 +0200 Subject: [PATCH 10/33] selftests/bpf: Extend tcx tests to cover late tcx_entry release Add a test case which replaces an active ingress qdisc while keeping the miniq in-tact during the transition period to the new clsact qdisc. # ./vmtest.sh -- ./test_progs -t tc_link [...] ./test_progs -t tc_link [ 3.412871] bpf_testmod: loading out-of-tree module taints kernel. [ 3.413343] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel #332 tc_links_after:OK #333 tc_links_append:OK #334 tc_links_basic:OK #335 tc_links_before:OK #336 tc_links_chain_classic:OK #337 tc_links_chain_mixed:OK #338 tc_links_dev_chain0:OK #339 tc_links_dev_cleanup:OK #340 tc_links_dev_mixed:OK #341 tc_links_ingress:OK #342 tc_links_invalid:OK #343 tc_links_prepend:OK #344 tc_links_replace:OK #345 tc_links_revision:OK Summary: 14/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Daniel Borkmann Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20240708133130.11609-2-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/config | 3 + .../selftests/bpf/prog_tests/tc_links.c | 61 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index eeabd798bc3a..98b6b6a886ce 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -58,9 +58,12 @@ CONFIG_MPLS=y CONFIG_MPLS_IPTUNNEL=y CONFIG_MPLS_ROUTING=y CONFIG_MPTCP=y +CONFIG_NET_ACT_SKBMOD=y +CONFIG_NET_CLS=y CONFIG_NET_CLS_ACT=y CONFIG_NET_CLS_BPF=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=y CONFIG_NET_FOU=y CONFIG_NET_FOU_IP_TUNNELS=y CONFIG_NET_IPGRE=y diff --git a/tools/testing/selftests/bpf/prog_tests/tc_links.c b/tools/testing/selftests/bpf/prog_tests/tc_links.c index bc9841144685..1af9ec1149aa 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_links.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_links.c @@ -9,6 +9,8 @@ #define ping_cmd "ping -q -c1 -w1 127.0.0.1 > /dev/null" #include "test_tc_link.skel.h" + +#include "netlink_helpers.h" #include "tc_helpers.h" void serial_test_tc_links_basic(void) @@ -1787,6 +1789,65 @@ void serial_test_tc_links_ingress(void) test_tc_links_ingress(BPF_TCX_INGRESS, false, false); } +struct qdisc_req { + struct nlmsghdr n; + struct tcmsg t; + char buf[1024]; +}; + +static int qdisc_replace(int ifindex, const char *kind, bool block) +{ + struct rtnl_handle rth = { .fd = -1 }; + struct qdisc_req req; + int err; + + err = rtnl_open(&rth, 0); + if (!ASSERT_OK(err, "open_rtnetlink")) + return err; + + memset(&req, 0, sizeof(req)); + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.n.nlmsg_flags = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_REQUEST; + req.n.nlmsg_type = RTM_NEWQDISC; + req.t.tcm_family = AF_UNSPEC; + req.t.tcm_ifindex = ifindex; + req.t.tcm_parent = 0xfffffff1; + + addattr_l(&req.n, sizeof(req), TCA_KIND, kind, strlen(kind) + 1); + if (block) + addattr32(&req.n, sizeof(req), TCA_INGRESS_BLOCK, 1); + + err = rtnl_talk(&rth, &req.n, NULL); + ASSERT_OK(err, "talk_rtnetlink"); + rtnl_close(&rth); + return err; +} + +void serial_test_tc_links_dev_chain0(void) +{ + int err, ifindex; + + ASSERT_OK(system("ip link add dev foo type veth peer name bar"), "add veth"); + ifindex = if_nametoindex("foo"); + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); + err = qdisc_replace(ifindex, "ingress", true); + if (!ASSERT_OK(err, "attaching ingress")) + goto cleanup; + ASSERT_OK(system("tc filter add block 1 matchall action skbmod swap mac"), "add block"); + err = qdisc_replace(ifindex, "clsact", false); + if (!ASSERT_OK(err, "attaching clsact")) + goto cleanup; + /* Heuristic: kern_sync_rcu() alone does not work; a wait-time of ~5s + * triggered the issue without the fix reliably 100% of the time. + */ + sleep(5); + ASSERT_OK(system("tc filter add dev foo ingress matchall action skbmod swap mac"), "add filter"); +cleanup: + ASSERT_OK(system("ip link del dev foo"), "del veth"); + ASSERT_EQ(if_nametoindex("foo"), 0, "foo removed"); + ASSERT_EQ(if_nametoindex("bar"), 0, "bar removed"); +} + static void test_tc_links_dev_mixed(int target) { LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); From 30f747b8d53bc73555f268d0f48f56174fa5bf10 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 5 Jul 2024 10:49:54 +0200 Subject: [PATCH 11/33] net: phy: microchip: lan87xx: reinit PHY after cable test Reinit PHY after cable test, otherwise link can't be established on tested port. This issue is reproducible on LAN9372 switches with integrated 100BaseT1 PHYs. Fixes: 788050256c411 ("net: phy: microchip_t1: add cable test support for lan87xx phy") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Michal Kubiak Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20240705084954.83048-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/microchip_t1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/microchip_t1.c b/drivers/net/phy/microchip_t1.c index a838b61cd844..a35528497a57 100644 --- a/drivers/net/phy/microchip_t1.c +++ b/drivers/net/phy/microchip_t1.c @@ -748,7 +748,7 @@ static int lan87xx_cable_test_report(struct phy_device *phydev) ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, lan87xx_cable_test_report_trans(detect)); - return 0; + return phy_init_hw(phydev); } static int lan87xx_cable_test_get_status(struct phy_device *phydev, From f0c18025693707ec344a70b6887f7450bf4c826b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 3 Jul 2024 16:39:31 +0800 Subject: [PATCH 12/33] skmsg: Skip zero length skb in sk_msg_recvmsg When running BPF selftests (./test_progs -t sockmap_basic) on a Loongarch platform, the following kernel panic occurs: [...] Oops[#1]: CPU: 22 PID: 2824 Comm: test_progs Tainted: G OE 6.10.0-rc2+ #18 Hardware name: LOONGSON Dabieshan/Loongson-TC542F0, BIOS Loongson-UDK2018 ... ... ra: 90000000048bf6c0 sk_msg_recvmsg+0x120/0x560 ERA: 9000000004162774 copy_page_to_iter+0x74/0x1c0 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 0000000c (PPLV0 +PIE +PWE) EUEN: 00000007 (+FPE +SXE +ASXE -BTE) ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7) ESTAT: 00010000 [PIL] (IS= ECode=1 EsubCode=0) BADV: 0000000000000040 PRID: 0014c011 (Loongson-64bit, Loongson-3C5000) Modules linked in: bpf_testmod(OE) xt_CHECKSUM xt_MASQUERADE xt_conntrack Process test_progs (pid: 2824, threadinfo=0000000000863a31, task=...) Stack : ... Call Trace: [<9000000004162774>] copy_page_to_iter+0x74/0x1c0 [<90000000048bf6c0>] sk_msg_recvmsg+0x120/0x560 [<90000000049f2b90>] tcp_bpf_recvmsg_parser+0x170/0x4e0 [<90000000049aae34>] inet_recvmsg+0x54/0x100 [<900000000481ad5c>] sock_recvmsg+0x7c/0xe0 [<900000000481e1a8>] __sys_recvfrom+0x108/0x1c0 [<900000000481e27c>] sys_recvfrom+0x1c/0x40 [<9000000004c076ec>] do_syscall+0x8c/0xc0 [<9000000003731da4>] handle_syscall+0xc4/0x160 Code: ... ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Fatal exception Kernel relocated by 0x3510000 .text @ 0x9000000003710000 .data @ 0x9000000004d70000 .bss @ 0x9000000006469400 ---[ end Kernel panic - not syncing: Fatal exception ]--- [...] This crash happens every time when running sockmap_skb_verdict_shutdown subtest in sockmap_basic. This crash is because a NULL pointer is passed to page_address() in the sk_msg_recvmsg(). Due to the different implementations depending on the architecture, page_address(NULL) will trigger a panic on Loongarch platform but not on x86 platform. So this bug was hidden on x86 platform for a while, but now it is exposed on Loongarch platform. The root cause is that a zero length skb (skb->len == 0) was put on the queue. This zero length skb is a TCP FIN packet, which was sent by shutdown(), invoked in test_sockmap_skb_verdict_shutdown(): shutdown(p1, SHUT_WR); In this case, in sk_psock_skb_ingress_enqueue(), num_sge is zero, and no page is put to this sge (see sg_set_page in sg_set_page), but this empty sge is queued into ingress_msg list. And in sk_msg_recvmsg(), this empty sge is used, and a NULL page is got by sg_page(sge). Pass this NULL page to copy_page_to_iter(), which passes it to kmap_local_page() and to page_address(), then kernel panics. To solve this, we should skip this zero length skb. So in sk_msg_recvmsg(), if copy is zero, that means it's a zero length skb, skip invoking copy_page_to_iter(). We are using the EFAULT return triggered by copy_page_to_iter to check for is_fin in tcp_bpf.c. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Suggested-by: John Fastabend Signed-off-by: Geliang Tang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/e3a16eacdc6740658ee02a33489b1b9d4912f378.1719992715.git.tanggeliang@kylinos.cn --- net/core/skmsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index fd20aae30be2..bbf40b999713 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,7 +434,8 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, page = sg_page(sge); if (copied + copy > len) copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (copy) + copy = copy_page_to_iter(page, sge->offset, copy, iter); if (!copy) { copied = copied ? copied : -EFAULT; goto out; From 0d1b7d6c927431126ece585246e23aa877243360 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Jul 2024 19:00:05 -0700 Subject: [PATCH 13/33] bnxt: fix crashes when reducing ring count with active RSS contexts bnxt doesn't check if a ring is used by RSS contexts when reducing ring count. Core performs a similar check for the drivers for the main context, but core doesn't know about additional contexts, so it can't validate them. bnxt_fill_hw_rss_tbl_p5() uses ring id to index bp->rx_ring[], which without the check may end up being out of bounds. BUG: KASAN: slab-out-of-bounds in __bnxt_hwrm_vnic_set_rss+0xb79/0xe40 Read of size 2 at addr ffff8881c5809618 by task ethtool/31525 Call Trace: __bnxt_hwrm_vnic_set_rss+0xb79/0xe40 bnxt_hwrm_vnic_rss_cfg_p5+0xf7/0x460 __bnxt_setup_vnic_p5+0x12e/0x270 __bnxt_open_nic+0x2262/0x2f30 bnxt_open_nic+0x5d/0xf0 ethnl_set_channels+0x5d4/0xb30 ethnl_default_set_doit+0x2f1/0x620 Core does track the additional contexts in net-next, so we can move this validation out of the driver as a follow up there. Fixes: b3d0083caf9a ("bnxt_en: Support RSS contexts in ethtool .{get|set}_rxfh()") Signed-off-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/20240705020005.681746-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 15 +++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 53085058100c..64b61a8d426d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6146,6 +6146,21 @@ static u16 bnxt_get_max_rss_ring(struct bnxt *bp) return max_ring; } +u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp) +{ + u16 i, tbl_size, max_ring = 0; + struct bnxt_rss_ctx *rss_ctx; + + tbl_size = bnxt_get_rxfh_indir_size(bp->dev); + + list_for_each_entry(rss_ctx, &bp->rss_ctx_list, list) { + for (i = 0; i < tbl_size; i++) + max_ring = max(max_ring, rss_ctx->rss_indir_tbl[i]); + } + + return max_ring; +} + int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings) { if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 9cf0acfa04e5..6b10a09ee1af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2776,6 +2776,7 @@ int bnxt_hwrm_vnic_set_tpa(struct bnxt *bp, struct bnxt_vnic_info *vnic, void bnxt_fill_ipv6_mask(__be32 mask[4]); int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); +u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp); int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 8763f8a01457..79c09c1cdf93 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -961,6 +961,12 @@ static int bnxt_set_channels(struct net_device *dev, return rc; } + if (req_rx_rings < bp->rx_nr_rings && + req_rx_rings <= bnxt_get_max_rss_ctx_ring(bp)) { + netdev_warn(dev, "Can't deactivate rings used by RSS contexts\n"); + return -EINVAL; + } + if (bnxt_get_nr_rss_ctxs(bp, req_rx_rings) != bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) && netif_is_rxfh_configured(dev)) { From 442e26af9aa8115c96541026cbfeaaa76c85d178 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Fri, 5 Jul 2024 12:53:17 +0300 Subject: [PATCH 14/33] octeontx2-af: Fix incorrect value output on error path in rvu_check_rsrc_availability() In rvu_check_rsrc_availability() in case of invalid SSOW req, an incorrect data is printed to error log. 'req->sso' value is printed instead of 'req->ssow'. Looks like "copy-paste" mistake. Fix this mistake by replacing 'req->sso' with 'req->ssow'. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 746ea74241fa ("octeontx2-af: Add RVU block LF provisioning support") Signed-off-by: Aleksandr Mishin Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240705095317.12640-1-amishin@t-argos.ru Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index ff78251f92d4..5f661e67ccbc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -1643,7 +1643,7 @@ static int rvu_check_rsrc_availability(struct rvu *rvu, if (req->ssow > block->lf.max) { dev_err(&rvu->pdev->dev, "Func 0x%x: Invalid SSOW req, %d > max %d\n", - pcifunc, req->sso, block->lf.max); + pcifunc, req->ssow, block->lf.max); return -EINVAL; } mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr); From 0913ec336a6c0c4a2b296bd9f74f8e41c4c83c8c Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Sat, 6 Jul 2024 12:13:37 +0200 Subject: [PATCH 15/33] net: ks8851: Fix deadlock with the SPI chip variant When SMP is enabled and spinlocks are actually functional then there is a deadlock with the 'statelock' spinlock between ks8851_start_xmit_spi and ks8851_irq: watchdog: BUG: soft lockup - CPU#0 stuck for 27s! call trace: queued_spin_lock_slowpath+0x100/0x284 do_raw_spin_lock+0x34/0x44 ks8851_start_xmit_spi+0x30/0xb8 ks8851_start_xmit+0x14/0x20 netdev_start_xmit+0x40/0x6c dev_hard_start_xmit+0x6c/0xbc sch_direct_xmit+0xa4/0x22c __qdisc_run+0x138/0x3fc qdisc_run+0x24/0x3c net_tx_action+0xf8/0x130 handle_softirqs+0x1ac/0x1f0 __do_softirq+0x14/0x20 ____do_softirq+0x10/0x1c call_on_irq_stack+0x3c/0x58 do_softirq_own_stack+0x1c/0x28 __irq_exit_rcu+0x54/0x9c irq_exit_rcu+0x10/0x1c el1_interrupt+0x38/0x50 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x64/0x68 __netif_schedule+0x6c/0x80 netif_tx_wake_queue+0x38/0x48 ks8851_irq+0xb8/0x2c8 irq_thread_fn+0x2c/0x74 irq_thread+0x10c/0x1b0 kthread+0xc8/0xd8 ret_from_fork+0x10/0x20 This issue has not been identified earlier because tests were done on a device with SMP disabled and so spinlocks were actually NOPs. Now use spin_(un)lock_bh for TX queue related locking to avoid execution of softirq work synchronously that would lead to a deadlock. Fixes: 3dc5d4454545 ("net: ks8851: Fix TX stall caused by TX buffer overrun") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Ronald Wahl Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240706101337.854474-1-rwahl@gmx.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/micrel/ks8851_common.c | 8 ++++---- drivers/net/ethernet/micrel/ks8851_spi.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 6453c92f0fa7..13462811eaae 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -352,11 +352,11 @@ static irqreturn_t ks8851_irq(int irq, void *_ks) netif_dbg(ks, intr, ks->netdev, "%s: txspace %d\n", __func__, tx_space); - spin_lock(&ks->statelock); + spin_lock_bh(&ks->statelock); ks->tx_space = tx_space; if (netif_queue_stopped(ks->netdev)) netif_wake_queue(ks->netdev); - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); } if (status & IRQ_SPIBEI) { @@ -635,14 +635,14 @@ static void ks8851_set_rx_mode(struct net_device *dev) /* schedule work to do the actual set of the data if needed */ - spin_lock(&ks->statelock); + spin_lock_bh(&ks->statelock); if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) { memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl)); schedule_work(&ks->rxctrl_work); } - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); } static int ks8851_set_mac_address(struct net_device *dev, void *addr) diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c index 670c1de966db..3062cc0f9199 100644 --- a/drivers/net/ethernet/micrel/ks8851_spi.c +++ b/drivers/net/ethernet/micrel/ks8851_spi.c @@ -340,10 +340,10 @@ static void ks8851_tx_work(struct work_struct *work) tx_space = ks8851_rdreg16_spi(ks, KS_TXMIR); - spin_lock(&ks->statelock); + spin_lock_bh(&ks->statelock); ks->queued_len -= dequeued_len; ks->tx_space = tx_space; - spin_unlock(&ks->statelock); + spin_unlock_bh(&ks->statelock); ks8851_unlock_spi(ks, &flags); } From f153831097b4435f963e385304cc0f1acba1c657 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Jul 2024 07:46:00 -0700 Subject: [PATCH 16/33] net: fix rc7's __skb_datagram_iter() X would not start in my old 32-bit partition (and the "n"-handling looks just as wrong on 64-bit, but for whatever reason did not show up there): "n" must be accumulated over all pages before it's added to "offset" and compared with "copy", immediately after the skb_frag_foreach_page() loop. Fixes: d2d30a376d9c ("net: allow skb_datagram_iter to be called from any context") Signed-off-by: Hugh Dickins Reviewed-by: Sagi Grimberg Link: https://patch.msgid.link/fef352e8-b89a-da51-f8ce-04bc39ee6481@google.com Signed-off-by: Jakub Kicinski --- net/core/datagram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index e9ba4c7b449d..e72dd78471a6 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -423,11 +423,12 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; + n = 0; skb_frag_foreach_page(frag, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_local_page(p); - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, vaddr + p_off, p_len, data, to); kunmap_local(vaddr); } From 01fc5142ae6b06b61ed51a624f2732d6525d8ea3 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Mon, 8 Jul 2024 16:07:49 -0700 Subject: [PATCH 17/33] i40e: Fix XDP program unloading while removing the driver The commit 6533e558c650 ("i40e: Fix reset path while removing the driver") introduced a new PF state "__I40E_IN_REMOVE" to block modifying the XDP program while the driver is being removed. Unfortunately, such a change is useful only if the ".ndo_bpf()" callback was called out of the rmmod context because unloading the existing XDP program is also a part of driver removing procedure. In other words, from the rmmod context the driver is expected to unload the XDP program without reporting any errors. Otherwise, the kernel warning with callstack is printed out to dmesg. Example failing scenario: 1. Load the i40e driver. 2. Load the XDP program. 3. Unload the i40e driver (using "rmmod" command). The example kernel warning log: [ +0.004646] WARNING: CPU: 94 PID: 10395 at net/core/dev.c:9290 unregister_netdevice_many_notify+0x7a9/0x870 [...] [ +0.010959] RIP: 0010:unregister_netdevice_many_notify+0x7a9/0x870 [...] [ +0.002726] Call Trace: [ +0.002457] [ +0.002119] ? __warn+0x80/0x120 [ +0.003245] ? unregister_netdevice_many_notify+0x7a9/0x870 [ +0.005586] ? report_bug+0x164/0x190 [ +0.003678] ? handle_bug+0x3c/0x80 [ +0.003503] ? exc_invalid_op+0x17/0x70 [ +0.003846] ? asm_exc_invalid_op+0x1a/0x20 [ +0.004200] ? unregister_netdevice_many_notify+0x7a9/0x870 [ +0.005579] ? unregister_netdevice_many_notify+0x3cc/0x870 [ +0.005586] unregister_netdevice_queue+0xf7/0x140 [ +0.004806] unregister_netdev+0x1c/0x30 [ +0.003933] i40e_vsi_release+0x87/0x2f0 [i40e] [ +0.004604] i40e_remove+0x1a1/0x420 [i40e] [ +0.004220] pci_device_remove+0x3f/0xb0 [ +0.003943] device_release_driver_internal+0x19f/0x200 [ +0.005243] driver_detach+0x48/0x90 [ +0.003586] bus_remove_driver+0x6d/0xf0 [ +0.003939] pci_unregister_driver+0x2e/0xb0 [ +0.004278] i40e_exit_module+0x10/0x5f0 [i40e] [ +0.004570] __do_sys_delete_module.isra.0+0x197/0x310 [ +0.005153] do_syscall_64+0x85/0x170 [ +0.003684] ? syscall_exit_to_user_mode+0x69/0x220 [ +0.004886] ? do_syscall_64+0x95/0x170 [ +0.003851] ? exc_page_fault+0x7e/0x180 [ +0.003932] entry_SYSCALL_64_after_hwframe+0x71/0x79 [ +0.005064] RIP: 0033:0x7f59dc9347cb [ +0.003648] Code: 73 01 c3 48 8b 0d 65 16 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 35 16 0c 00 f7 d8 64 89 01 48 [ +0.018753] RSP: 002b:00007ffffac99048 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [ +0.007577] RAX: ffffffffffffffda RBX: 0000559b9bb2f6e0 RCX: 00007f59dc9347cb [ +0.007140] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 0000559b9bb2f748 [ +0.007146] RBP: 00007ffffac99070 R08: 1999999999999999 R09: 0000000000000000 [ +0.007133] R10: 00007f59dc9a5ac0 R11: 0000000000000206 R12: 0000000000000000 [ +0.007141] R13: 00007ffffac992d8 R14: 0000559b9bb2f6e0 R15: 0000000000000000 [ +0.007151] [ +0.002204] ---[ end trace 0000000000000000 ]--- Fix this by checking if the XDP program is being loaded or unloaded. Then, block only loading a new program while "__I40E_IN_REMOVE" is set. Also, move testing "__I40E_IN_REMOVE" flag to the beginning of XDP_SETUP callback to avoid unnecessary operations and checks. Fixes: 6533e558c650 ("i40e: Fix reset path while removing the driver") Signed-off-by: Michal Kubiak Reviewed-by: Maciej Fijalkowski Tested-by: Chandan Kumar Rout (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20240708230750.625986-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 284c3fad5a6e..310513d9321b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13293,6 +13293,10 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, bool need_reset; int i; + /* VSI shall be deleted in a moment, block loading new programs */ + if (prog && test_bit(__I40E_IN_REMOVE, pf->state)) + return -EINVAL; + /* Don't allow frames that span over multiple buffers */ if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) { NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags"); @@ -13301,14 +13305,9 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, struct bpf_prog *prog, /* When turning XDP on->off/off->on we reset and rebuild the rings. */ need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog); - if (need_reset) i40e_prep_for_reset(pf); - /* VSI shall be deleted in a moment, just return EINVAL */ - if (test_bit(__I40E_IN_REMOVE, pf->state)) - return -EINVAL; - old_prog = xchg(&vsi->xdp_prog, prog); if (need_reset) { From e1533b6319ab9c3a97dad314dd88b3783bc41b69 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Mon, 8 Jul 2024 22:58:26 +0200 Subject: [PATCH 18/33] net: ethernet: lantiq_etop: fix double free in detach The number of the currently released descriptor is never incremented which results in the same skb being released multiple times. Fixes: 504d4721ee8e ("MIPS: Lantiq: Add ethernet driver") Reported-by: Joe Perches Closes: https://lore.kernel.org/all/fc1bf93d92bb5b2f99c6c62745507cc22f3a7b2d.camel@perches.com/ Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240708205826.5176-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/lantiq_etop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 5352fee62d2b..0b9982804370 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -217,9 +217,9 @@ ltq_etop_free_channel(struct net_device *dev, struct ltq_etop_chan *ch) if (ch->dma.irq) free_irq(ch->dma.irq, priv); if (IS_RX(ch->idx)) { - int desc; + struct ltq_dma_channel *dma = &ch->dma; - for (desc = 0; desc < LTQ_DESC_NUM; desc++) + for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++) dev_kfree_skb_any(ch->skb[ch->dma.desc]); } } From af253aef183a31ce62d2e39fc520b0ebfb562bb9 Mon Sep 17 00:00:00 2001 From: Mohammad Shehar Yaar Tausif Date: Wed, 10 Jul 2024 12:05:22 +0200 Subject: [PATCH 19/33] bpf: fix order of args in call to bpf_map_kvcalloc The original function call passed size of smap->bucket before the number of buckets which raises the error 'calloc-transposed-args' on compilation. Vlastimil Babka added: The order of parameters can be traced back all the way to 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") accross several refactorings, and that's why the commit is used as a Fixes: tag. In v6.10-rc1, a different commit 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") however exposed the order of args in a way that gcc-14 has enough visibility to start warning about it, because (in !CONFIG_MEMCG case) bpf_map_kvcalloc is then a macro alias for kvcalloc instead of a static inline wrapper. To sum up the warning happens when the following conditions are all met: - gcc-14 is used (didn't see it with gcc-13) - commit 2c321f3f70bc is present - CONFIG_MEMCG is not enabled in .config - CONFIG_WERROR turns this from a compiler warning to error Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Reviewed-by: Andrii Nakryiko Tested-by: Christian Kujau Signed-off-by: Mohammad Shehar Yaar Tausif Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20240710100521.15061-2-vbabka@suse.cz Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; From d4523831f07a267a943f0dde844bf8ead7495f13 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Jul 2024 18:54:38 +0000 Subject: [PATCH 20/33] bpf: Fail bpf_timer_cancel when callback is being cancelled Given a schedule: timer1 cb timer2 cb bpf_timer_cancel(timer2); bpf_timer_cancel(timer1); Both bpf_timer_cancel calls would wait for the other callback to finish executing, introducing a lockup. Add an atomic_t count named 'cancelling' in bpf_hrtimer. This keeps track of all in-flight cancellation requests for a given BPF timer. Whenever cancelling a BPF timer, we must check if we have outstanding cancellation requests, and if so, we must fail the operation with an error (-EDEADLK) since cancellation is synchronous and waits for the callback to finish executing. This implies that we can enter a deadlock situation involving two or more timer callbacks executing in parallel and attempting to cancel one another. Note that we avoid incrementing the cancelling counter for the target timer (the one being cancelled) if bpf_timer_cancel is not invoked from a callback, to avoid spurious errors. The whole point of detecting cur->cancelling and returning -EDEADLK is to not enter a busy wait loop (which may or may not lead to a lockup). This does not apply in case the caller is in a non-callback context, the other side can continue to cancel as it sees fit without running into errors. Background on prior attempts: Earlier versions of this patch used a bool 'cancelling' bit and used the following pattern under timer->lock to publish cancellation status. lock(t->lock); t->cancelling = true; mb(); if (cur->cancelling) return -EDEADLK; unlock(t->lock); hrtimer_cancel(t->timer); t->cancelling = false; The store outside the critical section could overwrite a parallel requests t->cancelling assignment to true, to ensure the parallely executing callback observes its cancellation status. It would be necessary to clear this cancelling bit once hrtimer_cancel is done, but lack of serialization introduced races. Another option was explored where bpf_timer_start would clear the bit when (re)starting the timer under timer->lock. This would ensure serialized access to the cancelling bit, but may allow it to be cleared before in-flight hrtimer_cancel has finished executing, such that lockups can occur again. Thus, we choose an atomic counter to keep track of all outstanding cancellation requests and use it to prevent lockups in case callbacks attempt to cancel each other while executing in parallel. Reported-by: Dohyun Kim Reported-by: Neel Natu Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240709185440.1104957-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a69a9a36c0f..22e779ca50d5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1107,6 +1107,7 @@ struct bpf_async_cb { struct bpf_hrtimer { struct bpf_async_cb cb; struct hrtimer timer; + atomic_t cancelling; }; struct bpf_work { @@ -1262,6 +1263,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u clockid = flags & (MAX_CLOCKS - 1); t = (struct bpf_hrtimer *)cb; + atomic_set(&t->cancelling, 0); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; @@ -1440,7 +1442,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async) BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { - struct bpf_hrtimer *t; + struct bpf_hrtimer *t, *cur_t; + bool inc = false; int ret = 0; if (in_nmi()) @@ -1452,14 +1455,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) ret = -EINVAL; goto out; } - if (this_cpu_read(hrtimer_running) == t) { + + cur_t = this_cpu_read(hrtimer_running); + if (cur_t == t) { /* If bpf callback_fn is trying to bpf_timer_cancel() * its own timer the hrtimer_cancel() will deadlock - * since it waits for callback_fn to finish + * since it waits for callback_fn to finish. */ ret = -EDEADLK; goto out; } + + /* Only account in-flight cancellations when invoked from a timer + * callback, since we want to avoid waiting only if other _callbacks_ + * are waiting on us, to avoid introducing lockups. Non-callback paths + * are ok, since nobody would synchronously wait for their completion. + */ + if (!cur_t) + goto drop; + atomic_inc(&t->cancelling); + /* Need full barrier after relaxed atomic_inc */ + smp_mb__after_atomic(); + inc = true; + if (atomic_read(&cur_t->cancelling)) { + /* We're cancelling timer t, while some other timer callback is + * attempting to cancel us. In such a case, it might be possible + * that timer t belongs to the other callback, or some other + * callback waiting upon it (creating transitive dependencies + * upon us), and we will enter a deadlock if we continue + * cancelling and waiting for it synchronously, since it might + * do the same. Bail! + */ + ret = -EDEADLK; + goto out; + } +drop: drop_prog_refcnt(&t->cb); out: __bpf_spin_unlock_irqrestore(&timer->lock); @@ -1467,6 +1497,8 @@ out: * if it was running. */ ret = ret ?: hrtimer_cancel(&t->timer); + if (inc) + atomic_dec(&t->cancelling); rcu_read_unlock(); return ret; } From a6fcd19d7eac1335eb76bc16b6a66b7f574d1d69 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Jul 2024 18:54:39 +0000 Subject: [PATCH 21/33] bpf: Defer work in bpf_timer_cancel_and_free Currently, the same case as previous patch (two timer callbacks trying to cancel each other) can be invoked through bpf_map_update_elem as well, or more precisely, freeing map elements containing timers. Since this relies on hrtimer_cancel as well, it is prone to the same deadlock situation as the previous patch. It would be sufficient to use hrtimer_try_to_cancel to fix this problem, as the timer cannot be enqueued after async_cancel_and_free. Once async_cancel_and_free has been done, the timer must be reinitialized before it can be armed again. The callback running in parallel trying to arm the timer will fail, and freeing bpf_hrtimer without waiting is sufficient (given kfree_rcu), and bpf_timer_cb will return HRTIMER_NORESTART, preventing the timer from being rearmed again. However, there exists a UAF scenario where the callback arms the timer before entering this function, such that if cancellation fails (due to timer callback invoking this routine, or the target timer callback running concurrently). In such a case, if the timer expiration is significantly far in the future, the RCU grace period expiration happening before it will free the bpf_hrtimer state and along with it the struct hrtimer, that is enqueued. Hence, it is clear cancellation needs to occur after async_cancel_and_free, and yet it cannot be done inline due to deadlock issues. We thus modify bpf_timer_cancel_and_free to defer work to the global workqueue, adding a work_struct alongside rcu_head (both used at _different_ points of time, so can share space). Update existing code comments to reflect the new state of affairs. Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20240709185440.1104957-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 61 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 22e779ca50d5..3243c83ef3e3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1084,7 +1084,10 @@ struct bpf_async_cb { struct bpf_prog *prog; void __rcu *callback_fn; void *value; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct delete_work; + }; u64 flags; }; @@ -1220,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work) kfree_rcu(w, cb.rcu); } +static void bpf_timer_delete_work(struct work_struct *work) +{ + struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); + + /* Cancel the timer and wait for callback to complete if it was running. + * If hrtimer_cancel() can be safely called it's safe to call + * kfree_rcu(t) right after for both preallocated and non-preallocated + * maps. The async->cb = NULL was already done and no code path can see + * address 't' anymore. Timer if armed for existing bpf_hrtimer before + * bpf_timer_cancel_and_free will have been cancelled. + */ + hrtimer_cancel(&t->timer); + kfree_rcu(t, cb.rcu); +} + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { @@ -1264,6 +1282,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); + INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); t->timer.function = bpf_timer_cb; cb->value = (void *)async - map->record->timer_off; @@ -1544,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val) if (!t) return; - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call kfree(t) - * right after for both preallocated and non-preallocated maps. - * The async->cb = NULL was already done and no code path can - * see address 't' anymore. - * - * Check that bpf_map_delete/update_elem() wasn't called from timer - * callback_fn. In such case don't call hrtimer_cancel() (since it will - * deadlock) and don't call hrtimer_try_to_cancel() (since it will just - * return -1). Though callback_fn is still running on this cpu it's + /* We check that bpf_map_delete/update_elem() was called from timer + * callback_fn. In such case we don't call hrtimer_cancel() (since it + * will deadlock) and don't call hrtimer_try_to_cancel() (since it will + * just return -1). Though callback_fn is still running on this cpu it's * safe to do kfree(t) because bpf_timer_cb() read everything it needed * from 't'. The bpf subprog callback_fn won't be able to access 't', * since async->cb = NULL was already done. The timer will be * effectively cancelled because bpf_timer_cb() will return * HRTIMER_NORESTART. + * + * However, it is possible the timer callback_fn calling us armed the + * timer _before_ calling us, such that failing to cancel it here will + * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. + * Therefore, we _need_ to cancel any outstanding timers before we do + * kfree_rcu, even though no more timers can be armed. + * + * Moreover, we need to schedule work even if timer does not belong to + * the calling callback_fn, as on two different CPUs, we can end up in a + * situation where both sides run in parallel, try to cancel one + * another, and we end up waiting on both sides in hrtimer_cancel + * without making forward progress, since timer1 depends on time2 + * callback to finish, and vice versa. + * + * CPU 1 (timer1_cb) CPU 2 (timer2_cb) + * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) + * + * To avoid these issues, punt to workqueue context when we are in a + * timer callback. */ - if (this_cpu_read(hrtimer_running) != t) - hrtimer_cancel(&t->timer); - kfree_rcu(t, cb.rcu); + if (this_cpu_read(hrtimer_running)) + queue_work(system_unbound_wq, &t->cb.delete_work); + else + bpf_timer_delete_work(&t->cb.delete_work); } /* This function is called by map_delete/update_elem for individual element and From 97a9063518f198ec0adb2ecb89789de342bb8283 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jul 2024 00:14:01 +0000 Subject: [PATCH 22/33] tcp: avoid too many retransmit packets If a TCP socket is using TCP_USER_TIMEOUT, and the other peer retracted its window to zero, tcp_retransmit_timer() can retransmit a packet every two jiffies (2 ms for HZ=1000), for about 4 minutes after TCP_USER_TIMEOUT has 'expired'. The fix is to make sure tcp_rtx_probe0_timed_out() takes icsk->icsk_user_timeout into account. Before blamed commit, the socket would not timeout after icsk->icsk_user_timeout, but would use standard exponential backoff for the retransmits. Also worth noting that before commit e89688e3e978 ("net: tcp: fix unexcepted socket die when snd_wnd is 0"), the issue would last 2 minutes instead of 4. Fixes: b701a99e431d ("tcp: Add tcp_clamp_rto_to_user_timeout() helper to improve accuracy") Signed-off-by: Eric Dumazet Cc: Neal Cardwell Reviewed-by: Jason Xing Reviewed-by: Jon Maxwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240710001402.2758273-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_timer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index db9d826560e5..892c86657fbc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -483,15 +483,26 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct sk_buff *skb, u32 rtx_delta) { + const struct inet_connection_sock *icsk = inet_csk(sk); + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - const int timeout = TCP_RTO_MAX * 2; + int timeout = TCP_RTO_MAX * 2; s32 rcv_delta; + if (user_timeout) { + /* If user application specified a TCP_USER_TIMEOUT, + * it does not want win 0 packets to 'reset the timer' + * while retransmits are not making progress. + */ + if (rtx_delta > user_timeout) + return true; + timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout)); + } /* Note: timer interrupt might have been delayed by at least one jiffy, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; + rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; From 76a0a3f9cc2fbd0e56671706bb74a9a988397898 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Tue, 9 Jul 2024 13:31:22 -0700 Subject: [PATCH 23/33] e1000e: fix force smbus during suspend flow Commit 861e8086029e ("e1000e: move force SMBUS from enable ulp function to avoid PHY loss issue") resolved a PHY access loss during suspend on Meteor Lake consumer platforms, but it affected corporate systems incorrectly. A better fix, working for both consumer and corporate systems, was proposed in commit bfd546a552e1 ("e1000e: move force SMBUS near the end of enable_ulp function"). However, it introduced a regression on older devices, such as [8086:15B8], [8086:15F9], [8086:15BE]. This patch aims to fix the secondary regression, by limiting the scope of the changes to Meteor Lake platforms only. Fixes: bfd546a552e1 ("e1000e: move force SMBUS near the end of enable_ulp function") Reported-by: Todd Brandt Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218940 Reported-by: Dieter Mummenschanz Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218936 Signed-off-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay (A Contingent Worker at Intel) Signed-off-by: Tony Nguyen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240709203123.2103296-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 73 +++++++++++++++------ 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 2e98a2a0bead..ce227b56cf72 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1108,6 +1108,46 @@ static s32 e1000_platform_pm_pch_lpt(struct e1000_hw *hw, bool link) return 0; } +/** + * e1000e_force_smbus - Force interfaces to transition to SMBUS mode. + * @hw: pointer to the HW structure + * + * Force the MAC and the PHY to SMBUS mode. Assumes semaphore already + * acquired. + * + * Return: 0 on success, negative errno on failure. + **/ +static s32 e1000e_force_smbus(struct e1000_hw *hw) +{ + u16 smb_ctrl = 0; + u32 ctrl_ext; + s32 ret_val; + + /* Switching PHY interface always returns MDI error + * so disable retry mechanism to avoid wasting time + */ + e1000e_disable_phy_retry(hw); + + /* Force SMBus mode in the PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &smb_ctrl); + if (ret_val) { + e1000e_enable_phy_retry(hw); + return ret_val; + } + + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, smb_ctrl); + + e1000e_enable_phy_retry(hw); + + /* Force SMBus mode in the MAC */ + ctrl_ext = er32(CTRL_EXT); + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; + ew32(CTRL_EXT, ctrl_ext); + + return 0; +} + /** * e1000_enable_ulp_lpt_lp - configure Ultra Low Power mode for LynxPoint-LP * @hw: pointer to the HW structure @@ -1165,6 +1205,14 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; + if (hw->mac.type != e1000_pch_mtp) { + ret_val = e1000e_force_smbus(hw); + if (ret_val) { + e_dbg("Failed to force SMBUS: %d\n", ret_val); + goto release; + } + } + /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable * LPLU and disable Gig speed when entering ULP */ @@ -1225,27 +1273,12 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) } release: - /* Switching PHY interface always returns MDI error - * so disable retry mechanism to avoid wasting time - */ - e1000e_disable_phy_retry(hw); - - /* Force SMBus mode in PHY */ - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); - if (ret_val) { - e1000e_enable_phy_retry(hw); - hw->phy.ops.release(hw); - goto out; + if (hw->mac.type == e1000_pch_mtp) { + ret_val = e1000e_force_smbus(hw); + if (ret_val) + e_dbg("Failed to force SMBUS over MTL system: %d\n", + ret_val); } - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); - - e1000e_enable_phy_retry(hw); - - /* Force SMBus mode in MAC */ - mac_reg = er32(CTRL_EXT); - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; - ew32(CTRL_EXT, mac_reg); hw->phy.ops.release(hw); out: From 8c6790b5c25dfac11b589cc37346bcf9e23ad468 Mon Sep 17 00:00:00 2001 From: Jian Hui Lee Date: Mon, 8 Jul 2024 14:52:09 +0800 Subject: [PATCH 24/33] net: ethernet: mtk-star-emac: set mac_managed_pm when probing The below commit introduced a warning message when phy state is not in the states: PHY_HALTED, PHY_READY, and PHY_UP. commit 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") mtk-star-emac doesn't need mdiobus suspend/resume. To fix the warning message during resume, indicate the phy resume/suspend is managed by the mac when probing. Fixes: 744d23c71af3 ("net: phy: Warn about incorrect mdio_bus_phy_resume() state") Signed-off-by: Jian Hui Lee Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20240708065210.4178980-1-jianhui.lee@canonical.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 31aebeb2e285..25989c79c92e 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1524,6 +1524,7 @@ static int mtk_star_probe(struct platform_device *pdev) { struct device_node *of_node; struct mtk_star_priv *priv; + struct phy_device *phydev; struct net_device *ndev; struct device *dev; void __iomem *base; @@ -1649,6 +1650,12 @@ static int mtk_star_probe(struct platform_device *pdev) netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll); netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll); + phydev = of_phy_find_device(priv->phy_node); + if (phydev) { + phydev->mac_managed_pm = true; + put_device(&phydev->mdio.dev); + } + return devm_register_netdev(dev, ndev); } From 50bd5a0c658d132507673c4d59347c025dd149ed Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 11 Jul 2024 05:27:09 +0000 Subject: [PATCH 25/33] selftests/bpf: Add timer lockup selftest Add a selftest that tries to trigger a situation where two timer callbacks are attempting to cancel each other's timer. By running them continuously, we hit a condition where both run in parallel and cancel each other. Without the fix in the previous patch, this would cause a lockup as hrtimer_cancel on either side will wait for forward progress from the callback. Ensure that this situation leads to a EDEADLK error. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240711052709.2148616-1-memxor@gmail.com --- .../selftests/bpf/prog_tests/timer_lockup.c | 91 +++++++++++++++++++ .../selftests/bpf/progs/timer_lockup.c | 87 ++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_lockup.c create mode 100644 tools/testing/selftests/bpf/progs/timer_lockup.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer_lockup.c b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c new file mode 100644 index 000000000000..871d16cb95cf --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_lockup.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include "timer_lockup.skel.h" + +static long cpu; +static int *timer1_err; +static int *timer2_err; +static bool skip; + +volatile int k = 0; + +static void *timer_lockup_thread(void *arg) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1000, + ); + int i, prog_fd = *(int *)arg; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), + &cpuset), + "cpu affinity"); + + for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) { + bpf_prog_test_run_opts(prog_fd, &opts); + /* Skip the test if we can't reproduce the race in a reasonable + * amount of time. + */ + if (i > 50) { + WRITE_ONCE(skip, true); + break; + } + } + + return NULL; +} + +void test_timer_lockup(void) +{ + int timer1_prog, timer2_prog; + struct timer_lockup *skel; + pthread_t thrds[2]; + void *ret; + + skel = timer_lockup__open_and_load(); + if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) + return; + + timer1_prog = bpf_program__fd(skel->progs.timer1_prog); + timer2_prog = bpf_program__fd(skel->progs.timer2_prog); + + timer1_err = &skel->bss->timer1_err; + timer2_err = &skel->bss->timer2_err; + + if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread, + &timer1_prog), + "pthread_create thread1")) + goto out; + if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread, + &timer2_prog), + "pthread_create thread2")) { + pthread_exit(&thrds[0]); + goto out; + } + + pthread_join(thrds[1], &ret); + pthread_join(thrds[0], &ret); + + if (skip) { + test__skip(); + goto out; + } + + if (*timer1_err != -EDEADLK && *timer1_err != 0) + ASSERT_FAIL("timer1_err bad value"); + if (*timer2_err != -EDEADLK && *timer2_err != 0) + ASSERT_FAIL("timer2_err bad value"); +out: + timer_lockup__destroy(skel); + return; +} diff --git a/tools/testing/selftests/bpf/progs/timer_lockup.c b/tools/testing/selftests/bpf/progs/timer_lockup.c new file mode 100644 index 000000000000..3e520133281e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_lockup.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer t; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer1_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer2_map SEC(".maps"); + +int timer1_err; +int timer2_err; + +static int timer_cb1(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) + timer2_err = bpf_timer_cancel(timer); + + return 0; +} + +static int timer_cb2(void *map, int *k, struct elem *v) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) + timer1_err = bpf_timer_cancel(timer); + + return 0; +} + +SEC("tc") +int timer1_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer1_map, &key); + if (timer) { + bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb1); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} + +SEC("tc") +int timer2_prog(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer2_map, &key); + if (timer) { + bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME); + bpf_timer_set_callback(timer, timer_cb2); + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); + } + + return 0; +} From f2aeb7306a898e1cbd03963d376f4b6656ca2b55 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 8 Jul 2024 14:56:15 +0300 Subject: [PATCH 26/33] ppp: reject claimed-as-LCP but actually malformed packets Since 'ppp_async_encode()' assumes valid LCP packets (with code from 1 to 7 inclusive), add 'ppp_check_packet()' to ensure that LCP packet has an actual body beyond PPP_LCP header bytes, and reject claimed-as-LCP but actually malformed data otherwise. Reported-by: syzbot+ec0723ba9605678b14bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ec0723ba9605678b14bf Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Dmitry Antipov Signed-off-by: Paolo Abeni --- drivers/net/ppp/ppp_generic.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 0a65b6d690fe..eb9acfcaeb09 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -70,6 +70,7 @@ #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ #define PPP_PROTO_LEN 2 +#define PPP_LCP_HDRLEN 4 /* * An instance of /dev/ppp can be associated with either a ppp @@ -493,6 +494,15 @@ static ssize_t ppp_read(struct file *file, char __user *buf, return ret; } +static bool ppp_check_packet(struct sk_buff *skb, size_t count) +{ + /* LCP packets must include LCP header which 4 bytes long: + * 1-byte code, 1-byte identifier, and 2-byte length. + */ + return get_unaligned_be16(skb->data) != PPP_LCP || + count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN; +} + static ssize_t ppp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -515,6 +525,11 @@ static ssize_t ppp_write(struct file *file, const char __user *buf, kfree_skb(skb); goto out; } + ret = -EINVAL; + if (unlikely(!ppp_check_packet(skb, count))) { + kfree_skb(skb); + goto out; + } switch (pf->kind) { case INTERFACE: From c184cf94e73b04ff7048d045f5413899bc664788 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 9 Jul 2024 08:19:43 +0200 Subject: [PATCH 27/33] ethtool: netlink: do not return SQI value if link is down Do not attach SQI value if link is down. "SQI values are only valid if link-up condition is present" per OpenAlliance specification of 100Base-T1 Interoperability Test suite [1]. The same rule would apply for other link types. [1] https://opensig.org/automotive-ethernet-specifications/# Fixes: 806602191592 ("ethtool: provide UAPI for PHY Signal Quality Index (SQI)") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Woojung Huh Link: https://patch.msgid.link/20240709061943.729381-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- net/ethtool/linkstate.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index b2de2108b356..34d76e87847d 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -37,6 +37,8 @@ static int linkstate_get_sqi(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi(phydev); mutex_unlock(&phydev->lock); @@ -55,6 +57,8 @@ static int linkstate_get_sqi_max(struct net_device *dev) mutex_lock(&phydev->lock); if (!phydev->drv || !phydev->drv->get_sqi_max) ret = -EOPNOTSUPP; + else if (!phydev->link) + ret = -ENETDOWN; else ret = phydev->drv->get_sqi_max(phydev); mutex_unlock(&phydev->lock); @@ -62,6 +66,17 @@ static int linkstate_get_sqi_max(struct net_device *dev) return ret; }; +static bool linkstate_sqi_critical_error(int sqi) +{ + return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; +} + +static bool linkstate_sqi_valid(struct linkstate_reply_data *data) +{ + return data->sqi >= 0 && data->sqi_max >= 0 && + data->sqi <= data->sqi_max; +} + static int linkstate_get_link_ext_state(struct net_device *dev, struct linkstate_reply_data *data) { @@ -93,12 +108,12 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, data->link = __ethtool_get_link(dev); ret = linkstate_get_sqi(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; ret = linkstate_get_sqi_max(dev); - if (ret < 0 && ret != -EOPNOTSUPP) + if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; @@ -136,11 +151,10 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; - if (data->sqi != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); - - if (data->sqi_max != -EOPNOTSUPP) - len += nla_total_size(sizeof(u32)); + if (linkstate_sqi_valid(data)) { + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */ + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ + } if (data->link_ext_state_provided) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ @@ -164,13 +178,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; - if (data->sqi != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) - return -EMSGSIZE; + if (linkstate_sqi_valid(data)) { + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; - if (data->sqi_max != -EOPNOTSUPP && - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) - return -EMSGSIZE; + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, + data->sqi_max)) + return -EMSGSIZE; + } if (data->link_ext_state_provided) { if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE, From 631a4b3ddc7831b20442c59c28b0476d0704c9af Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Jul 2024 02:02:26 +0200 Subject: [PATCH 28/33] netfilter: nfnetlink_queue: drop bogus WARN_ON Happens when rules get flushed/deleted while packet is out, so remove this WARN_ON. This WARN exists in one form or another since v4.14, no need to backport this to older releases, hence use a more recent fixes tag. Fixes: 3f8019688894 ("netfilter: move nf_reinject into nfnetlink_queue modules") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407081453.11ac0f63-lkp@intel.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f1c31757e496..55e28e1da66e 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -325,7 +325,7 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; - if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { + if (!hooks || i >= hooks->num_hook_entries) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; From cff3bd012a9512ac5ed858d38e6ed65f6391008c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Jul 2024 11:06:39 +0200 Subject: [PATCH 29/33] netfilter: nf_tables: prefer nft_chain_validate nft_chain_validate already performs loop detection because a cycle will result in a call stack overflow (ctx->level >= NFT_JUMP_STACK_SIZE). It also follows maps via ->validate callback in nft_lookup, so there appears no reason to iterate the maps again. nf_tables_check_loops() and all its helper functions can be removed. This improves ruleset load time significantly, from 23s down to 12s. This also fixes a crash bug. Old loop detection code can result in unbounded recursion: BUG: TASK stack guard page was hit at .... Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN CPU: 4 PID: 1539 Comm: nft Not tainted 6.10.0-rc5+ #1 [..] with a suitable ruleset during validation of register stores. I can't see any actual reason to attempt to check for this from nft_validate_register_store(), at this point the transaction is still in progress, so we don't have a full picture of the rule graph. For nf-next it might make sense to either remove it or make this depend on table->validate_state in case we could catch an error earlier (for improved error reporting to userspace). Fixes: 20a69341f2d0 ("netfilter: nf_tables: add netlink set API") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 158 +++------------------------------- 1 file changed, 13 insertions(+), 145 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 081c08536d0f..91cc3a81ba8f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3823,6 +3823,15 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r nf_tables_rule_destroy(ctx, rule); } +/** nft_chain_validate - loop detection and hook validation + * + * @ctx: context containing call depth and base chain + * @chain: chain to validate + * + * Walk through the rules of the given chain and chase all jumps/gotos + * and set lookups until either the jump limit is hit or all reachable + * chains have been validated. + */ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; @@ -3844,6 +3853,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (!expr->ops->validate) continue; + /* This may call nft_chain_validate() recursively, + * callers that do so must increment ctx->level. + */ err = expr->ops->validate(ctx, expr, &data); if (err < 0) return err; @@ -10809,150 +10821,6 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); -/* - * Loop detection - walk through the ruleset beginning at the destination chain - * of a new jump until either the source chain is reached (loop) or all - * reachable chains have been traversed. - * - * The loop check is performed whenever a new jump verdict is added to an - * expression or verdict map or a verdict map is bound to a new chain. - */ - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain); - -static int nft_check_loops(const struct nft_ctx *ctx, - const struct nft_set_ext *ext) -{ - const struct nft_data *data; - int ret; - - data = nft_set_ext_data(ext); - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - ret = nf_tables_check_loops(ctx, data->verdict.chain); - break; - default: - ret = 0; - break; - } - - return ret; -} - -static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, - struct nft_set *set, - const struct nft_set_iter *iter, - struct nft_elem_priv *elem_priv) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - - if (!nft_set_elem_active(ext, iter->genmask)) - return 0; - - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) - return 0; - - return nft_check_loops(ctx, ext); -} - -static int nft_set_catchall_loops(const struct nft_ctx *ctx, - struct nft_set *set) -{ - u8 genmask = nft_genmask_next(ctx->net); - struct nft_set_elem_catchall *catchall; - struct nft_set_ext *ext; - int ret = 0; - - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - if (!nft_set_elem_active(ext, genmask)) - continue; - - ret = nft_check_loops(ctx, ext); - if (ret < 0) - return ret; - } - - return ret; -} - -static int nf_tables_check_loops(const struct nft_ctx *ctx, - const struct nft_chain *chain) -{ - const struct nft_rule *rule; - const struct nft_expr *expr, *last; - struct nft_set *set; - struct nft_set_binding *binding; - struct nft_set_iter iter; - - if (ctx->chain == chain) - return -ELOOP; - - if (fatal_signal_pending(current)) - return -EINTR; - - list_for_each_entry(rule, &chain->rules, list) { - nft_rule_for_each_expr(expr, last, rule) { - struct nft_immediate_expr *priv; - const struct nft_data *data; - int err; - - if (strcmp(expr->ops->type->name, "immediate")) - continue; - - priv = nft_expr_priv(expr); - if (priv->dreg != NFT_REG_VERDICT) - continue; - - data = &priv->data; - switch (data->verdict.code) { - case NFT_JUMP: - case NFT_GOTO: - err = nf_tables_check_loops(ctx, - data->verdict.chain); - if (err < 0) - return err; - break; - default: - break; - } - } - } - - list_for_each_entry(set, &ctx->table->sets, list) { - if (!nft_is_active_next(ctx->net, set)) - continue; - if (!(set->flags & NFT_SET_MAP) || - set->dtype != NFT_DATA_VERDICT) - continue; - - list_for_each_entry(binding, &set->bindings, list) { - if (!(binding->flags & NFT_SET_MAP) || - binding->chain != chain) - continue; - - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_loop_check_setelem; - - set->ops->walk(ctx, set, &iter); - if (!iter.err) - iter.err = nft_set_catchall_loops(ctx, set); - - if (iter.err < 0) - return iter.err; - } - } - - return 0; -} - /** * nft_parse_u32_check - fetch u32 attribute and check for maximum value * @@ -11065,7 +10933,7 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, if (data != NULL && (data->verdict.code == NFT_GOTO || data->verdict.code == NFT_JUMP)) { - err = nf_tables_check_loops(ctx, data->verdict.chain); + err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; } From 5c0b485a8c6116516f33925b9ce5b6104a6eadfd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 Jul 2024 12:13:56 -0700 Subject: [PATCH 30/33] udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port(). syzkaller triggered the warning [0] in udp_v4_early_demux(). In udp_v[46]_early_demux() and sk_lookup(), we do not touch the refcount of the looked-up sk and use sock_pfree() as skb->destructor, so we check SOCK_RCU_FREE to ensure that the sk is safe to access during the RCU grace period. Currently, SOCK_RCU_FREE is flagged for a bound socket after being put into the hash table. Moreover, the SOCK_RCU_FREE check is done too early in udp_v[46]_early_demux() and sk_lookup(), so there could be a small race window: CPU1 CPU2 ---- ---- udp_v4_early_demux() udp_lib_get_port() | |- hlist_add_head_rcu() |- sk = __udp4_lib_demux_lookup() | |- DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); `- sock_set_flag(sk, SOCK_RCU_FREE) We had the same bug in TCP and fixed it in commit 871019b22d1b ("net: set SOCK_RCU_FREE before inserting socket into hashtable"). Let's apply the same fix for UDP. [0]: WARNING: CPU: 0 PID: 11198 at net/ipv4/udp.c:2599 udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599 Modules linked in: CPU: 0 PID: 11198 Comm: syz-executor.1 Not tainted 6.9.0-g93bda33046e7 #13 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:udp_v4_early_demux+0x481/0xb70 net/ipv4/udp.c:2599 Code: c5 7a 15 fe bb 01 00 00 00 44 89 e9 31 ff d3 e3 81 e3 bf ef ff ff 89 de e8 2c 74 15 fe 85 db 0f 85 02 06 00 00 e8 9f 7a 15 fe <0f> 0b e8 98 7a 15 fe 49 8d 7e 60 e8 4f 39 2f fe 49 c7 46 60 20 52 RSP: 0018:ffffc9000ce3fa58 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff8318c92c RDX: ffff888036ccde00 RSI: ffffffff8318c2f1 RDI: 0000000000000001 RBP: ffff88805a2dd6e0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0001ffffffffffff R12: ffff88805a2dd680 R13: 0000000000000007 R14: ffff88800923f900 R15: ffff88805456004e FS: 00007fc449127640(0000) GS:ffff88807dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc449126e38 CR3: 000000003de4b002 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 PKRU: 55555554 Call Trace: ip_rcv_finish_core.constprop.0+0xbdd/0xd20 net/ipv4/ip_input.c:349 ip_rcv_finish+0xda/0x150 net/ipv4/ip_input.c:447 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip_rcv+0x16c/0x180 net/ipv4/ip_input.c:569 __netif_receive_skb_one_core+0xb3/0xe0 net/core/dev.c:5624 __netif_receive_skb+0x21/0xd0 net/core/dev.c:5738 netif_receive_skb_internal net/core/dev.c:5824 [inline] netif_receive_skb+0x271/0x300 net/core/dev.c:5884 tun_rx_batched drivers/net/tun.c:1549 [inline] tun_get_user+0x24db/0x2c50 drivers/net/tun.c:2002 tun_chr_write_iter+0x107/0x1a0 drivers/net/tun.c:2048 new_sync_write fs/read_write.c:497 [inline] vfs_write+0x76f/0x8d0 fs/read_write.c:590 ksys_write+0xbf/0x190 fs/read_write.c:643 __do_sys_write fs/read_write.c:655 [inline] __se_sys_write fs/read_write.c:652 [inline] __x64_sys_write+0x41/0x50 fs/read_write.c:652 x64_sys_call+0xe66/0x1990 arch/x86/include/generated/asm/syscalls_64.h:2 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4b/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fc44a68bc1f Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 e9 cf f5 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 3c d0 f5 ff 48 RSP: 002b:00007fc449126c90 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00000000004bc050 RCX: 00007fc44a68bc1f RDX: 0000000000000032 RSI: 00000000200000c0 RDI: 00000000000000c8 RBP: 00000000004bc050 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000032 R11: 0000000000000293 R12: 0000000000000000 R13: 000000000000000b R14: 00007fc44a5ec530 R15: 0000000000000000 Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240709191356.24010-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 189c9113fe9a..578668878a85 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -326,6 +326,8 @@ found: goto fail_unlock; } + sock_set_flag(sk, SOCK_RCU_FREE); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -342,7 +344,7 @@ found: hslot2->count++; spin_unlock(&hslot2->lock); } - sock_set_flag(sk, SOCK_RCU_FREE); + error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); From 7a99afef17af66c276c1d6e6f4dbcac223eaf6ac Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Tue, 9 Jul 2024 21:58:45 +0200 Subject: [PATCH 31/33] net: ks8851: Fix potential TX stall after interface reopen The amount of TX space in the hardware buffer is tracked in the tx_space variable. The initial value is currently only set during driver probing. After closing the interface and reopening it the tx_space variable has the last value it had before close. If it is smaller than the size of the first send packet after reopeing the interface the queue will be stopped. The queue is woken up after receiving a TX interrupt but this will never happen since we did not send anything. This commit moves the initialization of the tx_space variable to the ks8851_net_open function right before starting the TX queue. Also query the value from the hardware instead of using a hard coded value. Only the SPI chip variant is affected by this issue because only this driver variant actually depends on the tx_space variable in the xmit function. Fixes: 3dc5d4454545 ("net: ks8851: Fix TX stall caused by TX buffer overrun") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Ronald Wahl Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20240709195845.9089-1-rwahl@gmx.de Signed-off-by: Paolo Abeni --- drivers/net/ethernet/micrel/ks8851_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 13462811eaae..7fa1820db9cc 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -482,6 +482,7 @@ static int ks8851_net_open(struct net_device *dev) ks8851_wrreg16(ks, KS_IER, ks->rc_ier); ks->queued_len = 0; + ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR); netif_start_queue(ks->netdev); netif_dbg(ks, ifup, ks->netdev, "network device up\n"); @@ -1101,7 +1102,6 @@ int ks8851_probe_common(struct net_device *netdev, struct device *dev, int ret; ks->netdev = netdev; - ks->tx_space = 6144; ks->gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); ret = PTR_ERR_OR_ZERO(ks->gpio); From 26488172b0292bed837b95a006a3f3431d1898c3 Mon Sep 17 00:00:00 2001 From: Chengen Du Date: Wed, 10 Jul 2024 13:37:47 +0800 Subject: [PATCH 32/33] net/sched: Fix UAF when resolving a clash KASAN reports the following UAF: BUG: KASAN: slab-use-after-free in tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct] Read of size 1 at addr ffff888c07603600 by task handler130/6469 Call Trace: dump_stack_lvl+0x48/0x70 print_address_description.constprop.0+0x33/0x3d0 print_report+0xc0/0x2b0 kasan_report+0xd0/0x120 __asan_load1+0x6c/0x80 tcf_ct_flow_table_process_conn+0x12b/0x380 [act_ct] tcf_ct_act+0x886/0x1350 [act_ct] tcf_action_exec+0xf8/0x1f0 fl_classify+0x355/0x360 [cls_flower] __tcf_classify+0x1fd/0x330 tcf_classify+0x21c/0x3c0 sch_handle_ingress.constprop.0+0x2c5/0x500 __netif_receive_skb_core.constprop.0+0xb25/0x1510 __netif_receive_skb_list_core+0x220/0x4c0 netif_receive_skb_list_internal+0x446/0x620 napi_complete_done+0x157/0x3d0 gro_cell_poll+0xcf/0x100 __napi_poll+0x65/0x310 net_rx_action+0x30c/0x5c0 __do_softirq+0x14f/0x491 __irq_exit_rcu+0x82/0xc0 irq_exit_rcu+0xe/0x20 common_interrupt+0xa1/0xb0 asm_common_interrupt+0x27/0x40 Allocated by task 6469: kasan_save_stack+0x38/0x70 kasan_set_track+0x25/0x40 kasan_save_alloc_info+0x1e/0x40 __kasan_krealloc+0x133/0x190 krealloc+0xaa/0x130 nf_ct_ext_add+0xed/0x230 [nf_conntrack] tcf_ct_act+0x1095/0x1350 [act_ct] tcf_action_exec+0xf8/0x1f0 fl_classify+0x355/0x360 [cls_flower] __tcf_classify+0x1fd/0x330 tcf_classify+0x21c/0x3c0 sch_handle_ingress.constprop.0+0x2c5/0x500 __netif_receive_skb_core.constprop.0+0xb25/0x1510 __netif_receive_skb_list_core+0x220/0x4c0 netif_receive_skb_list_internal+0x446/0x620 napi_complete_done+0x157/0x3d0 gro_cell_poll+0xcf/0x100 __napi_poll+0x65/0x310 net_rx_action+0x30c/0x5c0 __do_softirq+0x14f/0x491 Freed by task 6469: kasan_save_stack+0x38/0x70 kasan_set_track+0x25/0x40 kasan_save_free_info+0x2b/0x60 ____kasan_slab_free+0x180/0x1f0 __kasan_slab_free+0x12/0x30 slab_free_freelist_hook+0xd2/0x1a0 __kmem_cache_free+0x1a2/0x2f0 kfree+0x78/0x120 nf_conntrack_free+0x74/0x130 [nf_conntrack] nf_ct_destroy+0xb2/0x140 [nf_conntrack] __nf_ct_resolve_clash+0x529/0x5d0 [nf_conntrack] nf_ct_resolve_clash+0xf6/0x490 [nf_conntrack] __nf_conntrack_confirm+0x2c6/0x770 [nf_conntrack] tcf_ct_act+0x12ad/0x1350 [act_ct] tcf_action_exec+0xf8/0x1f0 fl_classify+0x355/0x360 [cls_flower] __tcf_classify+0x1fd/0x330 tcf_classify+0x21c/0x3c0 sch_handle_ingress.constprop.0+0x2c5/0x500 __netif_receive_skb_core.constprop.0+0xb25/0x1510 __netif_receive_skb_list_core+0x220/0x4c0 netif_receive_skb_list_internal+0x446/0x620 napi_complete_done+0x157/0x3d0 gro_cell_poll+0xcf/0x100 __napi_poll+0x65/0x310 net_rx_action+0x30c/0x5c0 __do_softirq+0x14f/0x491 The ct may be dropped if a clash has been resolved but is still passed to the tcf_ct_flow_table_process_conn function for further usage. This issue can be fixed by retrieving ct from skb again after confirming conntrack. Fixes: 0cc254e5aa37 ("net/sched: act_ct: Offload connections with commit action") Co-developed-by: Gerald Yang Signed-off-by: Gerald Yang Signed-off-by: Chengen Du Link: https://patch.msgid.link/20240710053747.13223-1-chengen.du@canonical.com Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2a96d9c1db65..6fa3cca87d34 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1077,6 +1077,14 @@ do_nat: */ if (nf_conntrack_confirm(skb) != NF_ACCEPT) goto drop; + + /* The ct may be dropped if a clash has been resolved, + * so it's necessary to retrieve it from skb again to + * prevent UAF. + */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + skip_add = true; } if (!skip_add) From 626dfed5fa3bfb41e0dffd796032b555b69f9cde Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 4 Jul 2024 08:41:57 +0200 Subject: [PATCH 33/33] net, sunrpc: Remap EPERM in case of connection failure in xs_tcp_setup_socket When using a BPF program on kernel_connect(), the call can return -EPERM. This causes xs_tcp_setup_socket() to loop forever, filling up the syslog and causing the kernel to potentially freeze up. Neil suggested: This will propagate -EPERM up into other layers which might not be ready to handle it. It might be safer to map EPERM to an error we would be more likely to expect from the network system - such as ECONNREFUSED or ENETDOWN. ECONNREFUSED as error seems reasonable. For programs setting a different error can be out of reach (see handling in 4fbac77d2d09) in particular on kernels which do not have f10d05966196 ("bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean"), thus given that it is better to simply remap for consistent behavior. UDP does handle EPERM in xs_udp_send_request(). Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect") Fixes: 4fbac77d2d09 ("bpf: Hooks for sys_bind") Co-developed-by: Lex Siegel Signed-off-by: Lex Siegel Signed-off-by: Daniel Borkmann Cc: Neil Brown Cc: Trond Myklebust Cc: Anna Schumaker Link: https://github.com/cilium/cilium/issues/33395 Link: https://lore.kernel.org/bpf/171374175513.12877.8993642908082014881@noble.neil.brown.name Link: https://patch.msgid.link/9069ec1d59e4b2129fc23433349fd5580ad43921.1720075070.git.daniel@iogearbox.net Signed-off-by: Paolo Abeni --- net/sunrpc/xprtsock.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dfc353eea8ed..0e1691316f42 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2441,6 +2441,13 @@ static void xs_tcp_setup_socket(struct work_struct *work) transport->srcport = 0; status = -EAGAIN; break; + case -EPERM: + /* Happens, for instance, if a BPF program is preventing + * the connect. Remap the error so upper layers can better + * deal with it. + */ + status = -ECONNREFUSED; + fallthrough; case -EINVAL: /* Happens, for instance, if the user specified a link * local IPv6 address without a scope-id.