From dbd7ae5154d5fff7e84a9f3010bb06499017ef29 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 2 Nov 2020 16:14:45 +0000 Subject: [PATCH 01/81] xfrm/compat: Translate by copying XFRMA_UNSPEC attribute xfrm_xlate32() translates 64-bit message provided by kernel to be sent for 32-bit listener (acknowledge or monitor). Translator code doesn't expect XFRMA_UNSPEC attribute as it doesn't know its payload. Kernel never attaches such attribute, but a user can. I've searched if any opensource does it and the answer is no. Nothing on github and google finds only tfcproject that has such code commented-out. What will happen if a user sends a netlink message with XFRMA_UNSPEC attribute? Ipsec code ignores this attribute. But if there is a monitor-process or 32-bit user requested ack - kernel will try to translate such message and will hit WARN_ONCE() in xfrm_xlate64_attr(). Deal with XFRMA_UNSPEC by copying the attribute payload with xfrm_nla_cpy(). In result, the default switch-case in xfrm_xlate64_attr() becomes an unused code. Leave those 3 lines in case a new xfrm attribute will be added. Fixes: 5461fc0c8d9f ("xfrm/compat: Add 64=>32-bit messages translator") Reported-by: syzbot+a7e701c8385bd8543074@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..17edbf935e35 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: From d1949d045fd67eab8a32a579a8c1ab1681330854 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 2 Nov 2020 16:14:46 +0000 Subject: [PATCH 02/81] xfrm/compat: memset(0) 64-bit padding at right place 32-bit messages translated by xfrm_compat can have attributes attached. For all, but XFRMA_SA, XFRMA_POLICY the size of payload is the same in 32-bit UABI and 64-bit UABI. For XFRMA_SA (struct xfrm_usersa_info) and XFRMA_POLICY (struct xfrm_userpolicy_info) it's only tail-padding that is present in 64-bit payload, but not in 32-bit. The proper size for destination nlattr is already calculated by xfrm_user_rcv_calculate_len64() and allocated with kvmalloc(). xfrm_attr_cpy32() copies 32-bit copy_len into 64-bit attribute translated payload, zero-filling possible padding for SA/POLICY. Due to a typo, *pos already has 64-bit payload size, in a result next memset(0) is called on the memory after the translated attribute, not on the tail-padding of it. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Reported-by: syzbot+c43831072e7df506a646@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 17edbf935e35..556e9f33b815 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -388,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); From ad37f77fd3659e87fd9833a83692e0e4eba0f5cd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 2 Nov 2020 16:14:47 +0000 Subject: [PATCH 03/81] xfrm/compat: Don't allocate memory with __GFP_ZERO 32-bit to 64-bit messages translator zerofies needed paddings in the translation, the rest is the actual payload. Don't allocate zero pages as they are not needed. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 556e9f33b815..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -564,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); From 48f486e13ffdb49fbb9b38c21d0e108ed60ab1a2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 10 Nov 2020 09:14:43 +0800 Subject: [PATCH 04/81] net: xfrm: fix memory leak in xfrm_user_policy() if xfrm_get_translator() failed, xfrm_user_policy() return without freeing 'data', which is allocated in memdup_sockptr(). Fixes: 96392ee5a13b ("xfrm/compat: Translate 32-bit user_policy from sockptr") Reported-by: Hulk Robot Signed-off-by: Yu Kuai Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a77da7aae6fe..2f1517827995 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2382,8 +2382,10 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) if (in_compat_syscall()) { struct xfrm_translator *xtr = xfrm_get_translator(); - if (!xtr) + if (!xtr) { + kfree(data); return -EOPNOTSUPP; + } err = xtr->xlate_user_policy_sockptr(&data, optlen); xfrm_put_translator(xtr); From ed1182dc004dbcc7cfe64fb0e8ac520b25431715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 27 Nov 2020 18:17:26 +0100 Subject: [PATCH 05/81] xdp: Handle MEM_TYPE_XSK_BUFF_POOL correctly in xdp_return_buff() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that it does exist a path where xdp_return_buff() is being passed an XDP buffer of type MEM_TYPE_XSK_BUFF_POOL. This path is when AF_XDP zero-copy mode is enabled, and a buffer is redirected to a DEVMAP with an attached XDP program that drops the buffer. This change simply puts the handling of MEM_TYPE_XSK_BUFF_POOL back into xdp_return_buff(). Fixes: 82c41671ca4f ("xdp: Simplify xdp_return_{frame, frame_rx_napi, buff}") Reported-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Maxim Mikityanskiy Link: https://lore.kernel.org/bpf/20201127171726.123627-1-bjorn.topel@gmail.com --- net/core/xdp.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..491ad569a79c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -335,11 +335,10 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. This path is never used by the - * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of - * the switch-statement. + * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -361,6 +360,10 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); @@ -370,19 +373,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ From f6a8250ea1e42ad1f4f3bab01c851ec5fd48f0e7 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 30 Nov 2020 14:33:35 -0800 Subject: [PATCH 06/81] libbpf: Fix ring_buffer__poll() to return number of consumed samples Fix ring_buffer__poll() to return the number of non-discarded records consumed, just like its documentation states. It's also consistent with ring_buffer__consume() return. Fix up selftests with wrong expected results. Fixes: bf99c936f947 ("libbpf: Add BPF ring buffer support") Fixes: cb1c9ddd5525 ("selftests/bpf: Add BPF ringbuf selftests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201130223336.904192-1-andrii@kernel.org --- tools/lib/bpf/ringbuf.c | 2 +- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 2 +- tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 5c6522c89af1..98537ff2679e 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -278,7 +278,7 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) err = ringbuf_process_ring(ring); if (err < 0) return err; - res += cnt; + res += err; } return cnt < 0 ? -errno : res; } diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index c1650548433c..1a48c6f7f54e 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -217,7 +217,7 @@ void test_ringbuf(void) if (CHECK(err, "join_bg", "err %d\n", err)) goto cleanup; - if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + if (CHECK(bg_ret <= 0, "bg_ret", "epoll_wait result: %ld", bg_ret)) goto cleanup; /* 3 rounds, 2 samples each */ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index 78e450609803..d37161e59bb2 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -81,7 +81,7 @@ void test_ringbuf_multi(void) /* poll for samples, should get 2 ringbufs back */ err = ring_buffer__poll(ringbuf, -1); - if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + if (CHECK(err != 2, "poll_res", "expected 2 records, got %d\n", err)) goto cleanup; /* expect extra polling to return nothing */ From 156c9b70dbfb83eeeff39e9202eb5f8bb6d0fd04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 30 Nov 2020 14:33:36 -0800 Subject: [PATCH 07/81] selftests/bpf: Drain ringbuf samples at the end of test Avoid occasional test failures due to the last sample being delayed to another ring_buffer__poll() call. Instead, drain samples completely with ring_buffer__consume(). This is supposed to fix a rare and non-deterministic test failure in libbpf CI. Fixes: cb1c9ddd5525 ("selftests/bpf: Add BPF ringbuf selftests") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201130223336.904192-2-andrii@kernel.org --- tools/testing/selftests/bpf/prog_tests/ringbuf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 1a48c6f7f54e..fddbc5db5d6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -220,6 +220,12 @@ void test_ringbuf(void) if (CHECK(bg_ret <= 0, "bg_ret", "epoll_wait result: %ld", bg_ret)) goto cleanup; + /* due to timing variations, there could still be non-notified + * samples, so consume them here to collect all the samples + */ + err = ring_buffer__consume(ringbuf); + CHECK(err < 0, "rb_consume", "failed: %d\b", err); + /* 3 rounds, 2 samples each */ cnt = atomic_xchg(&sample_cnt, 0); CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt); From f5da54187e33dce9bea63170667dbb0ca8d98194 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 1 Dec 2020 21:56:57 +0800 Subject: [PATCH 08/81] xsk: Replace datagram_poll by sock_poll_wait datagram_poll will judge the current socket status (EPOLLIN, EPOLLOUT) based on the traditional socket information (eg: sk_wmem_alloc), but this does not apply to xsk. So this patch uses sock_poll_wait instead of datagram_poll, and the mask is calculated by xsk_poll. Fixes: c497176cb2e4 ("xsk: add Rx receive functions and poll support") Signed-off-by: Xuan Zhuo Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/e82f4697438cd63edbf271ebe1918db8261b7c09.1606555939.git.xuanzhuo@linux.alibaba.com --- net/xdp/xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b7b039bd9d03..9bbfd8adbb73 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -471,11 +471,13 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { - __poll_t mask = datagram_poll(file, sock, wait); + __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; + sock_poll_wait(file, sock, wait); + if (unlikely(!xsk_is_bound(xs))) return mask; From 3413f04141aa440c71da187755e8e22f5093ce83 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Tue, 1 Dec 2020 21:56:58 +0800 Subject: [PATCH 09/81] xsk: Change the tx writeable condition Modify the tx writeable condition from the queue is not full to the number of present tx queues is less than the half of the total number of queues. Because the tx queue not full is a very short time, this will cause a large number of EPOLLOUT events, and cause a large number of process wake up. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Xuan Zhuo Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/508fef55188d4e1160747ead64c6dcda36735880.1606555939.git.xuanzhuo@linux.alibaba.com --- net/xdp/xsk.c | 16 +++++++++++++--- net/xdp/xsk_queue.h | 6 ++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9bbfd8adbb73..62504471fd20 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -211,6 +211,14 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, return 0; } +static bool xsk_tx_writeable(struct xdp_sock *xs) +{ + if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) + return false; + + return true; +} + static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { @@ -296,7 +304,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { __xskq_cons_release(xs->tx); - xs->sk.sk_write_space(&xs->sk); + if (xsk_tx_writeable(xs)) + xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); } @@ -436,7 +445,8 @@ static int xsk_generic_xmit(struct sock *sk) out: if (sent_frame) - sk->sk_write_space(sk); + if (xsk_tx_writeable(xs)) + sk->sk_write_space(sk); mutex_unlock(&xs->mutex); return err; @@ -493,7 +503,7 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; - if (xs->tx && !xskq_cons_is_full(xs->tx)) + if (xs->tx && xsk_tx_writeable(xs)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..9e71b9f27679 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -264,6 +264,12 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) q->nentries; } +static inline u32 xskq_cons_present_entries(struct xsk_queue *q) +{ + /* No barriers needed since data is not accessed */ + return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); +} + /* Functions for producers */ static inline bool xskq_prod_is_full(struct xsk_queue *q) From 8fca2b8706f39f86312c086229e0cb364f8b4f97 Mon Sep 17 00:00:00 2001 From: Wen Gong Date: Mon, 23 Nov 2020 16:45:52 +0800 Subject: [PATCH 10/81] mac80211: fix return value of ieee80211_chandef_he_6ghz_oper ieee80211_chandef_he_6ghz_oper() needs to return true if it determined a value 6 GHz chandef, fix that. Fixes: 1d00ce807efa ("mac80211: support S1G association") Signed-off-by: Wen Gong Link: https://lore.kernel.org/r/1606121152-3452-1-git-send-email-wgong@codeaurora.org [rewrite commit message] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 49342060490f..94e624e9439b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3455,7 +3455,7 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, *chandef = he_chandef; - return false; + return true; } bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, From f495acd8851d7b345e5f0e521b2645b1e1f928a0 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Sun, 29 Nov 2020 17:30:44 +0200 Subject: [PATCH 11/81] cfg80211: initialize rekey_data In case we have old supplicant, the akm field is uninitialized. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20201129172929.930f0ab7ebee.Ic546e384efab3f4a89f318eafddc3eb7d556aecb@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a77174b99b07..f67ddf2cebcb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12634,7 +12634,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct nlattr *tb[NUM_NL80211_REKEY_DATA]; - struct cfg80211_gtk_rekey_data rekey_data; + struct cfg80211_gtk_rekey_data rekey_data = {}; int err; if (!info->attrs[NL80211_ATTR_REKEY_DATA]) From bdeca45a0cc58f864f1eb2e919304203ff5c5f39 Mon Sep 17 00:00:00 2001 From: "Borwankar, Antara" Date: Sun, 29 Nov 2020 17:30:53 +0200 Subject: [PATCH 12/81] mac80211: set SDATA_STATE_RUNNING for monitor interfaces During restarrt, mac80211 is supposed to reconfigure the driver. When there's a monitor interface, the interface is added and the channel context for it was created, but not assigned to it as it was not considered running during the restart. Fix this by setting SDATA_STATE_RUNNING while adding monitor interfaces. Signed-off-by: Borwankar, Antara Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20201129172929.e1df99693a4c.I494579f28018c2d0b9d4083a664cf872c28405ae@changeid [reword commit log] Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1be775979132..44154cc596cd 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -948,6 +948,8 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + set_bit(SDATA_STATE_RUNNING, &sdata->state); + ret = ieee80211_check_queues(sdata, NL80211_IFTYPE_MONITOR); if (ret) { kfree(sdata); From 12c8a8ca117f3d734babc3fba131fdaa329d2163 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 18:21:16 +0800 Subject: [PATCH 13/81] xsk: Return error code if force_zc is set If force_zc is set, we should exit out with an error, not fall back to copy mode. Fixes: 921b68692abb ("xsk: Enable sharing of dma mappings") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/1607077277-41995-1-git-send-email-zhangchangzhong@huawei.com --- net/xdp/xsk_buff_pool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 9287eddec52c..d5adeee9d5d9 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -175,6 +175,7 @@ static int __xp_assign_dev(struct xsk_buff_pool *pool, if (!pool->dma_pages) { WARN(1, "Driver did not DMA map zero-copy buffers"); + err = -EINVAL; goto err_unreg_xsk; } pool->umem->zc = true; From 4eef8b1f36f2ff06966b8f7c2143ef0c447877de Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 3 Dec 2020 19:40:47 +0100 Subject: [PATCH 14/81] net/sched: fq_pie: initialize timer earlier in fq_pie_init() with the following tdc testcase: 83be: (qdisc, fq_pie) Create FQ-PIE with invalid number of flows as fq_pie_init() fails, fq_pie_destroy() is called to clean up. Since the timer is not yet initialized, it's possible to observe a splat like this: INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 975 Comm: tc Not tainted 5.10.0-rc4+ #298 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x99/0xcb register_lock_class+0x12dd/0x1750 __lock_acquire+0xfe/0x3970 lock_acquire+0x1c8/0x7f0 del_timer_sync+0x49/0xd0 fq_pie_destroy+0x3f/0x80 [sch_fq_pie] qdisc_create+0x916/0x1160 tc_modify_qdisc+0x3c4/0x1630 rtnetlink_rcv_msg+0x346/0x8e0 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 [...] ODEBUG: assert_init not available (active state 0) object type: timer_list hint: 0x0 WARNING: CPU: 0 PID: 975 at lib/debugobjects.c:508 debug_print_object+0x162/0x210 [...] Call Trace: debug_object_assert_init+0x268/0x380 try_to_del_timer_sync+0x6a/0x100 del_timer_sync+0x9e/0xd0 fq_pie_destroy+0x3f/0x80 [sch_fq_pie] qdisc_create+0x916/0x1160 tc_modify_qdisc+0x3c4/0x1630 rtnetlink_rcv_msg+0x346/0x8e0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 fix it moving timer_setup() before any failure, like it was done on 'red' with former commit 608b4adab178 ("net_sched: initialize timer earlier in red_init()"). Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Signed-off-by: Davide Caratti Reviewed-by: Cong Wang Link: https://lore.kernel.org/r/2e78e01c504c633ebdff18d041833cf2e079a3a4.1607020450.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq_pie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 4dda15588cf4..949163fe68af 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -401,6 +401,7 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); + timer_setup(&q->adapt_timer, fq_pie_timer, 0); if (opt) { err = fq_pie_change(sch, opt, extack); @@ -426,7 +427,6 @@ static int fq_pie_init(struct Qdisc *sch, struct nlattr *opt, pie_vars_init(&flow->vars); } - timer_setup(&q->adapt_timer, fq_pie_timer, 0); mod_timer(&q->adapt_timer, jiffies + HZ / 2); return 0; From 1130b252480f3c98cf468e78c1c5c516b390a29c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 3 Dec 2020 15:51:06 -0600 Subject: [PATCH 15/81] net: ipa: pass the correct size when freeing DMA memory When the coherent memory is freed in gsi_trans_pool_exit_dma(), we are mistakenly passing the size of a single element in the pool rather than the actual allocated size. Fix this bug. Fixes: 9dd441e4ed575 ("soc: qcom: ipa: GSI transactions") Reported-by: Stephen Boyd Tested-by: Sujit Kautkar Signed-off-by: Alex Elder Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201203215106.17450-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi_trans.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipa/gsi_trans.c b/drivers/net/ipa/gsi_trans.c index e8599bb948c0..6c3ed5b17b80 100644 --- a/drivers/net/ipa/gsi_trans.c +++ b/drivers/net/ipa/gsi_trans.c @@ -156,6 +156,9 @@ int gsi_trans_pool_init_dma(struct device *dev, struct gsi_trans_pool *pool, /* The allocator will give us a power-of-2 number of pages. But we * can't guarantee that, so request it. That way we won't waste any * memory that would be available beyond the required space. + * + * Note that gsi_trans_pool_exit_dma() assumes the total allocated + * size is exactly (count * size). */ total_size = get_order(total_size) << PAGE_SHIFT; @@ -175,7 +178,9 @@ int gsi_trans_pool_init_dma(struct device *dev, struct gsi_trans_pool *pool, void gsi_trans_pool_exit_dma(struct device *dev, struct gsi_trans_pool *pool) { - dma_free_coherent(dev, pool->size, pool->base, pool->addr); + size_t total_size = pool->count * pool->size; + + dma_free_coherent(dev, total_size, pool->base, pool->addr); memset(pool, 0, sizeof(*pool)); } From 0b32e91fdfd87314af9943e69eb85a88adb4233c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 4 Dec 2020 00:20:37 +0100 Subject: [PATCH 16/81] ethernet: select CONFIG_CRC32 as needed A number of ethernet drivers require crc32 functionality to be avaialable in the kernel, causing a link error otherwise: arm-linux-gnueabi-ld: drivers/net/ethernet/agere/et131x.o: in function `et1310_setup_device_for_multicast': et131x.c:(.text+0x5918): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/cadence/macb_main.o: in function `macb_start_xmit': macb_main.c:(.text+0x4b88): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/faraday/ftgmac100.o: in function `ftgmac100_set_rx_mode': ftgmac100.c:(.text+0x2b38): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/freescale/fec_main.o: in function `set_multicast_list': fec_main.c:(.text+0x6120): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/freescale/fman/fman_dtsec.o: in function `dtsec_add_hash_mac_address': fman_dtsec.c:(.text+0x830): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/freescale/fman/fman_dtsec.o:fman_dtsec.c:(.text+0xb68): more undefined references to `crc32_le' follow arm-linux-gnueabi-ld: drivers/net/ethernet/netronome/nfp/nfpcore/nfp_hwinfo.o: in function `nfp_hwinfo_read': nfp_hwinfo.c:(.text+0x250): undefined reference to `crc32_be' arm-linux-gnueabi-ld: nfp_hwinfo.c:(.text+0x288): undefined reference to `crc32_be' arm-linux-gnueabi-ld: drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.o: in function `nfp_resource_acquire': nfp_resource.c:(.text+0x144): undefined reference to `crc32_be' arm-linux-gnueabi-ld: nfp_resource.c:(.text+0x158): undefined reference to `crc32_be' arm-linux-gnueabi-ld: drivers/net/ethernet/nxp/lpc_eth.o: in function `lpc_eth_set_multicast_list': lpc_eth.c:(.text+0x1934): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/rocker/rocker_ofdpa.o: in function `ofdpa_flow_tbl_do': rocker_ofdpa.c:(.text+0x2e08): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/rocker/rocker_ofdpa.o: in function `ofdpa_flow_tbl_del': rocker_ofdpa.c:(.text+0x3074): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/rocker/rocker_ofdpa.o: in function `ofdpa_port_fdb': arm-linux-gnueabi-ld: drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.o: in function `mlx5dr_ste_calc_hash_index': dr_ste.c:(.text+0x354): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/microchip/lan743x_main.o: in function `lan743x_netdev_set_multicast': lan743x_main.c:(.text+0x5dc4): undefined reference to `crc32_le' Add the missing 'select CRC32' entries in Kconfig for each of them. Signed-off-by: Arnd Bergmann Acked-by: Nicolas Ferre Acked-by: Madalin Bucur Acked-by: Mark Einon Acked-by: Simon Horman Link: https://lore.kernel.org/r/20201203232114.1485603-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/agere/Kconfig | 1 + drivers/net/ethernet/cadence/Kconfig | 1 + drivers/net/ethernet/faraday/Kconfig | 1 + drivers/net/ethernet/freescale/Kconfig | 1 + drivers/net/ethernet/freescale/fman/Kconfig | 1 + drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + drivers/net/ethernet/microchip/Kconfig | 1 + drivers/net/ethernet/netronome/Kconfig | 1 + drivers/net/ethernet/nxp/Kconfig | 1 + drivers/net/ethernet/rocker/Kconfig | 1 + 10 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/agere/Kconfig b/drivers/net/ethernet/agere/Kconfig index d92516ae59cc..9cd750184947 100644 --- a/drivers/net/ethernet/agere/Kconfig +++ b/drivers/net/ethernet/agere/Kconfig @@ -21,6 +21,7 @@ config ET131X tristate "Agere ET-1310 Gigabit Ethernet support" depends on PCI select PHYLIB + select CRC32 help This driver supports Agere ET-1310 ethernet adapters. diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig index 85858163bac5..e432a68ac520 100644 --- a/drivers/net/ethernet/cadence/Kconfig +++ b/drivers/net/ethernet/cadence/Kconfig @@ -23,6 +23,7 @@ config MACB tristate "Cadence MACB/GEM support" depends on HAS_DMA && COMMON_CLK select PHYLINK + select CRC32 help The Cadence MACB ethernet interface is found on many Atmel AT32 and AT91 parts. This driver also supports the Cadence GEM (Gigabit diff --git a/drivers/net/ethernet/faraday/Kconfig b/drivers/net/ethernet/faraday/Kconfig index c2677ec0564d..3d1e9a302148 100644 --- a/drivers/net/ethernet/faraday/Kconfig +++ b/drivers/net/ethernet/faraday/Kconfig @@ -33,6 +33,7 @@ config FTGMAC100 depends on !64BIT || BROKEN select PHYLIB select MDIO_ASPEED if MACH_ASPEED_G6 + select CRC32 help This driver supports the FTGMAC100 Gigabit Ethernet controller from Faraday. It is used on Faraday A369, Andes AG102 and some diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index a1d53ddf1593..3f9175bdce77 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -25,6 +25,7 @@ config FEC depends on (M523x || M527x || M5272 || M528x || M520x || M532x || \ ARCH_MXC || SOC_IMX28 || COMPILE_TEST) default ARCH_MXC || SOC_IMX28 if ARM + select CRC32 select PHYLIB imply PTP_1588_CLOCK help diff --git a/drivers/net/ethernet/freescale/fman/Kconfig b/drivers/net/ethernet/freescale/fman/Kconfig index 34150182cc35..48bf8088795d 100644 --- a/drivers/net/ethernet/freescale/fman/Kconfig +++ b/drivers/net/ethernet/freescale/fman/Kconfig @@ -4,6 +4,7 @@ config FSL_FMAN depends on FSL_SOC || ARCH_LAYERSCAPE || COMPILE_TEST select GENERIC_ALLOCATOR select PHYLIB + select CRC32 default n help Freescale Data-Path Acceleration Architecture Frame Manager diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 99f1ec3b2575..3e371d24c462 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -198,6 +198,7 @@ config MLX5_EN_TLS config MLX5_SW_STEERING bool "Mellanox Technologies software-managed steering" depends on MLX5_CORE_EN && MLX5_ESWITCH + select CRC32 default y help Build support for software-managed steering in the NIC. diff --git a/drivers/net/ethernet/microchip/Kconfig b/drivers/net/ethernet/microchip/Kconfig index 31f9a82dc113..d0f6dfe0dcf3 100644 --- a/drivers/net/ethernet/microchip/Kconfig +++ b/drivers/net/ethernet/microchip/Kconfig @@ -47,6 +47,7 @@ config LAN743X depends on PCI select PHYLIB select CRC16 + select CRC32 help Support for the Microchip LAN743x PCI Express Gigabit Ethernet chip diff --git a/drivers/net/ethernet/netronome/Kconfig b/drivers/net/ethernet/netronome/Kconfig index d8b99d6a0356..b82758d5beed 100644 --- a/drivers/net/ethernet/netronome/Kconfig +++ b/drivers/net/ethernet/netronome/Kconfig @@ -22,6 +22,7 @@ config NFP depends on VXLAN || VXLAN=n depends on TLS && TLS_DEVICE || TLS_DEVICE=n select NET_DEVLINK + select CRC32 help This driver supports the Netronome(R) NFP4000/NFP6000 based cards working as a advanced Ethernet NIC. It works with both diff --git a/drivers/net/ethernet/nxp/Kconfig b/drivers/net/ethernet/nxp/Kconfig index ee83a71c2509..c84997db828c 100644 --- a/drivers/net/ethernet/nxp/Kconfig +++ b/drivers/net/ethernet/nxp/Kconfig @@ -3,6 +3,7 @@ config LPC_ENET tristate "NXP ethernet MAC on LPC devices" depends on ARCH_LPC32XX || COMPILE_TEST select PHYLIB + select CRC32 help Say Y or M here if you want to use the NXP ethernet MAC included on some NXP LPC devices. You can safely enable this option for LPC32xx diff --git a/drivers/net/ethernet/rocker/Kconfig b/drivers/net/ethernet/rocker/Kconfig index 99e1290e0307..2318811ff75a 100644 --- a/drivers/net/ethernet/rocker/Kconfig +++ b/drivers/net/ethernet/rocker/Kconfig @@ -19,6 +19,7 @@ if NET_VENDOR_ROCKER config ROCKER tristate "Rocker switch driver (EXPERIMENTAL)" depends on PCI && NET_SWITCHDEV && BRIDGE + select CRC32 help This driver supports Rocker switch device. From b410f04eb5b482b5efc4eee90de81ad35d3d923b Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 16:48:14 +0800 Subject: [PATCH 17/81] ipv4: fix error return code in rtm_to_fib_config() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: d15662682db2 ("ipv4: Allow ipv6 gateway with ipv4 routes") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Reviewed-by: David Ahern Link: https://lore.kernel.org/r/1607071695-33740-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b87140a1fa28..cdf6ec5aa45d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - goto errout; + return -EINVAL; } return 0; From ee4f52a8de2c6f78b01f10b4c330867d88c1653a Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 16:48:56 +0800 Subject: [PATCH 18/81] net: bridge: vlan: fix error return code in __vlan_add() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: f8ed289fab84 ("bridge: vlan: use br_vlan_(get|put)_master to deal with refcounts") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/1607071737-33875-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3e493eb85bb2..08c77418c687 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -266,8 +266,10 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } masterv = br_vlan_get_master(br, v->vid, extack); - if (!masterv) + if (!masterv) { + err = -ENOMEM; goto out_filt; + } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); From bb2da7651a47dc042cb7fc9c40cd77092b6b4445 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Fri, 4 Dec 2020 19:43:14 +0800 Subject: [PATCH 19/81] openvswitch: fix error return code in validate_and_copy_dec_ttl() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Changing 'return start' to 'return action_start' can fix this bug. Fixes: 69929d4c49e1 ("net: openvswitch: fix TTL decrement action netlink message format") Reported-by: Hulk Robot Signed-off-by: Wang Hai Reviewed-by: Eelco Chaudron Link: https://lore.kernel.org/r/20201204114314.1596-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ec0689ddc635..4c5c2331e764 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2531,7 +2531,7 @@ static int validate_and_copy_dec_ttl(struct net *net, action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); if (action_start < 0) - return start; + return action_start; err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, mpls_label_count, log); From 905b2032fa424f253d9126271439cc1db2b01130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Dec 2020 08:24:28 -0800 Subject: [PATCH 20/81] mac80211: mesh: fix mesh_pathtbl_init() error path If tbl_mpp can not be allocated, we call mesh_table_free(tbl_path) while tbl_path rhashtable has not yet been initialized, which causes panics. Simply factorize the rhashtable_init() call into mesh_table_alloc() WARNING: CPU: 1 PID: 8474 at kernel/workqueue.c:3040 __flush_work kernel/workqueue.c:3040 [inline] WARNING: CPU: 1 PID: 8474 at kernel/workqueue.c:3040 __cancel_work_timer+0x514/0x540 kernel/workqueue.c:3136 Modules linked in: CPU: 1 PID: 8474 Comm: syz-executor663 Not tainted 5.10.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__flush_work kernel/workqueue.c:3040 [inline] RIP: 0010:__cancel_work_timer+0x514/0x540 kernel/workqueue.c:3136 Code: 5d c3 e8 bf ae 29 00 0f 0b e9 f0 fd ff ff e8 b3 ae 29 00 0f 0b 43 80 3c 3e 00 0f 85 31 ff ff ff e9 34 ff ff ff e8 9c ae 29 00 <0f> 0b e9 dc fe ff ff 89 e9 80 e1 07 80 c1 03 38 c1 0f 8c 7d fd ff RSP: 0018:ffffc9000165f5a0 EFLAGS: 00010293 RAX: ffffffff814b7064 RBX: 0000000000000001 RCX: ffff888021c80000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888024039ca0 R08: dffffc0000000000 R09: fffffbfff1dd3e64 R10: fffffbfff1dd3e64 R11: 0000000000000000 R12: 1ffff920002cbebd R13: ffff888024039c88 R14: 1ffff11004807391 R15: dffffc0000000000 FS: 0000000001347880(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000140 CR3: 000000002cc0a000 CR4: 00000000001506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rhashtable_free_and_destroy+0x25/0x9c0 lib/rhashtable.c:1137 mesh_table_free net/mac80211/mesh_pathtbl.c:69 [inline] mesh_pathtbl_init+0x287/0x2e0 net/mac80211/mesh_pathtbl.c:785 ieee80211_mesh_init_sdata+0x2ee/0x530 net/mac80211/mesh.c:1591 ieee80211_setup_sdata+0x733/0xc40 net/mac80211/iface.c:1569 ieee80211_if_add+0xd5c/0x1cd0 net/mac80211/iface.c:1987 ieee80211_add_iface+0x59/0x130 net/mac80211/cfg.c:125 rdev_add_virtual_intf net/wireless/rdev-ops.h:45 [inline] nl80211_new_interface+0x563/0xb40 net/wireless/nl80211.c:3855 genl_family_rcv_msg_doit net/netlink/genetlink.c:739 [inline] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] genl_rcv_msg+0xe4e/0x1280 net/netlink/genetlink.c:800 netlink_rcv_skb+0x190/0x3a0 net/netlink/af_netlink.c:2494 genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x780/0x930 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x9a8/0xd40 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:651 [inline] sock_sendmsg net/socket.c:671 [inline] ____sys_sendmsg+0x519/0x800 net/socket.c:2353 ___sys_sendmsg net/socket.c:2407 [inline] __sys_sendmsg+0x2b1/0x360 net/socket.c:2440 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 60854fd94573 ("mac80211: mesh: convert path table to rhashtable") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Johannes Berg Link: https://lore.kernel.org/r/20201204162428.2583119-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/mac80211/mesh_pathtbl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 48f31ac9233c..620ecf922408 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,6 +60,7 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); + rhashtable_init(&newtbl->rhead, &mesh_rht_params); return newtbl; } @@ -773,9 +774,6 @@ int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata) goto free_path; } - rhashtable_init(&tbl_path->rhead, &mesh_rht_params); - rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params); - sdata->u.mesh.mesh_paths = tbl_path; sdata->u.mesh.mpp_paths = tbl_mpp; From a54ba3465d86fa5dd7d41bb88c0b5e71fb3b627e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 3 Dec 2020 23:26:16 +0100 Subject: [PATCH 21/81] ch_ktls: fix build warning for ipv4-only config When CONFIG_IPV6 is disabled, clang complains that a variable is uninitialized for non-IPv4 data: drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c:1046:6: error: variable 'cntrl1' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] if (tx_info->ip_family == AF_INET) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c:1059:2: note: uninitialized use occurs here cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | ^~~~~~ Replace the preprocessor conditional with the corresponding C version, and make the ipv4 case unconditional in this configuration to improve readability and avoid the warning. Fixes: 86716b51d14f ("ch_ktls: Update cheksum information") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20201203222641.964234-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 7f90b828d159..1b7e8c91b541 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -987,9 +987,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct fw_eth_tx_pkt_wr *wr; struct cpl_tx_pkt_core *cpl; u32 ctrl, iplen, maclen; -#if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ip6; -#endif unsigned int ndesc; struct tcphdr *tcp; int len16, pktlen; @@ -1043,17 +1041,15 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, cpl->len = htons(pktlen); memcpy(buf, skb->data, pktlen); - if (tx_info->ip_family == AF_INET) { + if (!IS_ENABLED(CONFIG_IPV6) || tx_info->ip_family == AF_INET) { /* we need to correct ip header len */ ip = (struct iphdr *)(buf + maclen); ip->tot_len = htons(pktlen - maclen); cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP); -#if IS_ENABLED(CONFIG_IPV6) } else { ip6 = (struct ipv6hdr *)(buf + maclen); ip6->payload_len = htons(pktlen - maclen - iplen); cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP6); -#endif } cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | From 4d1be581ec6b92a338bb7ed23e1381f45ddf336f Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Fri, 4 Dec 2020 14:35:06 +0100 Subject: [PATCH 22/81] can: softing: softing_netdev_open(): fix error handling If softing_netdev_open() fails, we should call close_candev() to avoid reference leak. Fixes: 03fd3cf5a179d ("can: add driver for Softing card") Signed-off-by: Zhang Qilong Acked-by: Kurt Van Dijck Link: https://lore.kernel.org/r/20201202151632.1343786-1-zhangqilong3@huawei.com Signed-off-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20201204133508.742120-2-mkl@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/can/softing/softing_main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c index 9d2faaa39ce4..c9ca8b9fceb9 100644 --- a/drivers/net/can/softing/softing_main.c +++ b/drivers/net/can/softing/softing_main.c @@ -382,8 +382,13 @@ static int softing_netdev_open(struct net_device *ndev) /* check or determine and set bittime */ ret = open_candev(ndev); - if (!ret) - ret = softing_startstop(ndev, 1); + if (ret) + return ret; + + ret = softing_startstop(ndev, 1); + if (ret < 0) + close_candev(ndev); + return ret; } From 205704c618af0ab2366015d2281a3b0814d918a0 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Thu, 3 Dec 2020 22:06:04 -0500 Subject: [PATCH 23/81] vrf: packets with lladdr src needs dst at input with orig_iif when needs strict Depending on the order of the routes to fe80::/64 are installed on the VRF table, the NS for the source link-local address of the originator might be sent to the wrong interface. This patch ensures that packets with link-local addr source is doing a lookup with the orig_iif when the destination addr indicates that it is strict. Add the reproducer as a use case in self test script fcnal-test.sh. Fixes: b4869aa2f881 ("net: vrf: ipv6 support for local traffic to local addresses") Signed-off-by: Stephen Suryaputra Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20201204030604.18828-1-ssuryaextr@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 10 ++- tools/testing/selftests/net/fcnal-test.sh | 95 +++++++++++++++++++++++ 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f2793ffde191..b9b7e00b72a8 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1315,11 +1315,17 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); + bool is_ll_src; /* loopback, multicast & non-ND link-local traffic; do not push through - * packet taps again. Reset pkt_type for upper layers to process skb + * packet taps again. Reset pkt_type for upper layers to process skb. + * for packets with lladdr src, however, skip so that the dst can be + * determine at input using original ifindex in the case that daddr + * needs strict */ - if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { + is_ll_src = ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL; + if (skb->pkt_type == PACKET_LOOPBACK || + (need_strict && !is_ndisc && !is_ll_src)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index fb5c55dd6df8..02b0b9ead40b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -256,6 +256,28 @@ setup_cmd_nsb() fi } +setup_cmd_nsc() +{ + local cmd="$*" + local rc + + run_cmd_nsc ${cmd} + rc=$? + if [ $rc -ne 0 ]; then + # show user the command if not done so already + if [ "$VERBOSE" = "0" ]; then + echo "setup command: $cmd" + fi + echo "failed. stopping tests" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue" + read a + fi + exit $rc + fi +} + # set sysctl values in NS-A set_sysctl() { @@ -471,6 +493,36 @@ setup() sleep 1 } +setup_lla_only() +{ + # make sure we are starting with a clean slate + kill_procs + cleanup 2>/dev/null + + log_debug "Configuring network namespaces" + set -e + + create_ns ${NSA} "-" "-" + create_ns ${NSB} "-" "-" + create_ns ${NSC} "-" "-" + connect_ns ${NSA} ${NSA_DEV} "-" "-" \ + ${NSB} ${NSB_DEV} "-" "-" + connect_ns ${NSA} ${NSA_DEV2} "-" "-" \ + ${NSC} ${NSC_DEV} "-" "-" + + NSA_LINKIP6=$(get_linklocal ${NSA} ${NSA_DEV}) + NSB_LINKIP6=$(get_linklocal ${NSB} ${NSB_DEV}) + NSC_LINKIP6=$(get_linklocal ${NSC} ${NSC_DEV}) + + create_vrf ${NSA} ${VRF} ${VRF_TABLE} "-" "-" + ip -netns ${NSA} link set dev ${NSA_DEV} vrf ${VRF} + ip -netns ${NSA} link set dev ${NSA_DEV2} vrf ${VRF} + + set +e + + sleep 1 +} + ################################################################################ # IPv4 @@ -3787,10 +3839,53 @@ use_case_br() setup_cmd_nsb ip li del vlan100 2>/dev/null } +# VRF only. +# ns-A device is connected to both ns-B and ns-C on a single VRF but only has +# LLA on the interfaces +use_case_ping_lla_multi() +{ + setup_lla_only + # only want reply from ns-A + setup_cmd_nsb sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + setup_cmd_nsc sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Pre cycle, ping out ns-B" + + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Pre cycle, ping out ns-C" + + # cycle/flap the first ns-A interface + setup_cmd ip link set ${NSA_DEV} down + setup_cmd ip link set ${NSA_DEV} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-C" + + # cycle/flap the second ns-A interface + setup_cmd ip link set ${NSA_DEV2} down + setup_cmd ip link set ${NSA_DEV2} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C" +} + use_cases() { log_section "Use cases" + log_subsection "Device enslaved to bridge" use_case_br + log_subsection "Ping LLA with multiple interfaces" + use_case_ping_lla_multi } ################################################################################ From 4de377b659035309ba48638d70f3150d5c67611f Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 16:49:42 +0800 Subject: [PATCH 24/81] net: marvell: prestera: Fix error return code in prestera_port_create() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 501ef3066c89 ("net: marvell: prestera: Add driver for Prestera family ASIC devices") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Link: https://lore.kernel.org/r/1607071782-34006-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 0f20e0788cce..da4b286d1337 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -318,8 +318,10 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) goto err_port_init; } - if (port->fp_id >= PRESTERA_MAC_ADDR_NUM_MAX) + if (port->fp_id >= PRESTERA_MAC_ADDR_NUM_MAX) { + err = -EINVAL; goto err_port_init; + } /* firmware requires that port's MAC address consist of the first * 5 bytes of the base MAC address From edd2410b165e2ef00b2264ae362edf7441ca929c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 4 Dec 2020 19:54:16 +0200 Subject: [PATCH 25/81] net: mscc: ocelot: fix dropping of unknown IPv4 multicast on Seville The current assumption is that the felix DSA driver has flooding knobs per traffic class, while ocelot switchdev has a single flooding knob. This was correct for felix VSC9959 and ocelot VSC7514, but with the introduction of seville VSC9953, we see a switch driven by felix.c which has a single flooding knob. So it is clear that we must do what should have been done from the beginning, which is not to overwrite the configuration done by ocelot.c in felix, but instead to teach the common ocelot library about the differences in our switches, and set up the flooding PGIDs centrally. The effect that the bogus iteration through FELIX_NUM_TC has upon seville is quite dramatic. ANA_FLOODING is located at 0x00b548, and ANA_FLOODING_IPMC is located at 0x00b54c. So the bogus iteration will actually overwrite ANA_FLOODING_IPMC when attempting to write ANA_FLOODING[1]. There is no ANA_FLOODING[1] in sevile, just ANA_FLOODING. And when ANA_FLOODING_IPMC is overwritten with a bogus value, the effect is that ANA_FLOODING_IPMC gets the value of 0x0003CF7D: MC6_DATA = 61, MC6_CTRL = 61, MC4_DATA = 60, MC4_CTRL = 0. Because MC4_CTRL is zero, this means that IPv4 multicast control packets are not flooded, but dropped. An invalid configuration, and this is how the issue was actually spotted. Reported-by: Eldar Gasanov Reported-by: Maxim Kochetkov Tested-by: Eldar Gasanov Fixes: 84705fc16552 ("net: dsa: felix: introduce support for Seville VSC9953 switch") Fixes: 3c7b51bd39b2 ("net: dsa: felix: allow flooding for all traffic classes") Signed-off-by: Vladimir Oltean Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201204175416.1445937-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix.c | 7 ------- drivers/net/dsa/ocelot/felix_vsc9959.c | 1 + drivers/net/dsa/ocelot/seville_vsc9953.c | 1 + drivers/net/ethernet/mscc/ocelot.c | 9 +++++---- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 1 + include/soc/mscc/ocelot.h | 3 +++ 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index f791860d495f..c444ef3da3e2 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -569,7 +569,6 @@ static int felix_setup(struct dsa_switch *ds) struct ocelot *ocelot = ds->priv; struct felix *felix = ocelot_to_felix(ocelot); int port, err; - int tc; err = felix_init_structs(felix, ds->num_ports); if (err) @@ -608,12 +607,6 @@ static int felix_setup(struct dsa_switch *ds) ocelot_write_rix(ocelot, ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports, 0)), ANA_PGID_PGID, PGID_UC); - /* Setup the per-traffic class flooding PGIDs */ - for (tc = 0; tc < FELIX_NUM_TC; tc++) - ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) | - ANA_FLOODING_FLD_BROADCAST(PGID_MC) | - ANA_FLOODING_FLD_UNICAST(PGID_UC), - ANA_FLOODING, tc); ds->mtu_enforcement_ingress = true; ds->configure_vlan_while_not_filtering = true; diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 3e925b8d5306..2e5bbdca5ea4 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1429,6 +1429,7 @@ static int felix_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, felix); ocelot = &felix->ocelot; ocelot->dev = &pdev->dev; + ocelot->num_flooding_pgids = FELIX_NUM_TC; felix->info = &felix_info_vsc9959; felix->switch_base = pci_resource_start(pdev, felix->info->switch_pci_bar); diff --git a/drivers/net/dsa/ocelot/seville_vsc9953.c b/drivers/net/dsa/ocelot/seville_vsc9953.c index 1d420c4a2f0f..ebbaf6817ec8 100644 --- a/drivers/net/dsa/ocelot/seville_vsc9953.c +++ b/drivers/net/dsa/ocelot/seville_vsc9953.c @@ -1210,6 +1210,7 @@ static int seville_probe(struct platform_device *pdev) ocelot = &felix->ocelot; ocelot->dev = &pdev->dev; + ocelot->num_flooding_pgids = 1; felix->info = &seville_info_vsc9953; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 70bf8c67d7ef..a53bd36b11c6 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1489,10 +1489,11 @@ int ocelot_init(struct ocelot *ocelot) SYS_FRM_AGING_MAX_AGE(307692), SYS_FRM_AGING); /* Setup flooding PGIDs */ - ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) | - ANA_FLOODING_FLD_BROADCAST(PGID_MC) | - ANA_FLOODING_FLD_UNICAST(PGID_UC), - ANA_FLOODING, 0); + for (i = 0; i < ocelot->num_flooding_pgids; i++) + ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) | + ANA_FLOODING_FLD_BROADCAST(PGID_MC) | + ANA_FLOODING_FLD_UNICAST(PGID_UC), + ANA_FLOODING, i); ocelot_write(ocelot, ANA_FLOODING_IPMC_FLD_MC6_DATA(PGID_MCIPV6) | ANA_FLOODING_IPMC_FLD_MC6_CTRL(PGID_MC) | ANA_FLOODING_IPMC_FLD_MC4_DATA(PGID_MCIPV4) | diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index dc00772950e5..1e7729421a82 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -1254,6 +1254,7 @@ static int mscc_ocelot_probe(struct platform_device *pdev) } ocelot->num_phys_ports = of_get_child_count(ports); + ocelot->num_flooding_pgids = 1; ocelot->vcap = vsc7514_vcap_props; ocelot->inj_prefix = OCELOT_TAG_PREFIX_NONE; diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1e9db9577441..49b46df476f2 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -618,6 +618,9 @@ struct ocelot { /* Keep track of the vlan port masks */ u32 vlan_mask[VLAN_N_VID]; + /* Switches like VSC9959 have flooding per traffic class */ + int num_flooding_pgids; + /* In tables like ANA:PORT and the ANA:PGID:PGID mask, * the CPU is located after the physical ports (at the * num_phys_ports index). From 61f54de2e9194f01874d5eda12037b0978e77519 Mon Sep 17 00:00:00 2001 From: Huazhong Tan Date: Mon, 7 Dec 2020 15:20:25 +0800 Subject: [PATCH 26/81] net: hns3: remove a misused pragma packed hclge_dbg_reg_info[] is defined as an array of packed structure accidentally. However, this array contains pointers, which are no longer aligned naturally, and cannot be relocated on PPC64. Hence, when compile-testing this driver on PPC64 with CONFIG_RELOCATABLE=y (e.g. PowerPC allyesconfig), there will be some warnings. Since each field in structure hclge_qos_pri_map_cmd and hclge_dbg_bitmap_cmd is type u8, the pragma packed is unnecessary for these two structures as well, so remove the pragma packed in hclge_debugfs.h to fix this issue, and this increases hclge_dbg_reg_info[] by 4 bytes per entry. Fixes: a582b78dfc33 ("net: hns3: code optimization for debugfs related to "dump reg"") Reported-by: Stephen Rothwell Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h index a9066e6ff697..ca2ab6cf84d9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.h @@ -35,8 +35,6 @@ #define HCLGE_DBG_DFX_SSU_2_OFFSET 12 -#pragma pack(1) - struct hclge_qos_pri_map_cmd { u8 pri0_tc : 4, pri1_tc : 4; @@ -85,8 +83,6 @@ struct hclge_dbg_reg_type_info { struct hclge_dbg_reg_common_msg reg_msg; }; -#pragma pack() - static const struct hclge_dbg_dfx_message hclge_dbg_bios_common_reg[] = { {false, "Reserved"}, {true, "BP_CPU_STATE"}, From 10c678bd0a035ac2c64a9b26b222f20556227a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 7 Dec 2020 15:55:40 +0800 Subject: [PATCH 27/81] udp: fix the proto value passed to ip_protocol_deliver_rcu for the segments Guillaume noticed that: for segments udp_queue_rcv_one_skb() returns the proto, and it should pass "ret" unmodified to ip_protocol_deliver_rcu(). Otherwize, with a negtive value passed, it will underflow inet_protos. This can be reproduced with IPIP FOU: # ip fou add port 5555 ipproto 4 # ethtool -K eth1 rx-gro-list on Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection") Reported-by: Guillaume Nault Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..9eeebd4a0054 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2173,7 +2173,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) - ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } From d9054a1ff585ba01029584ab730efc794603d68f Mon Sep 17 00:00:00 2001 From: Dongdong Wang Date: Fri, 4 Dec 2020 23:59:45 -0800 Subject: [PATCH 28/81] lwt: Disable BH too in run_lwt_bpf() The per-cpu bpf_redirect_info is shared among all skb_do_redirect() and BPF redirect helpers. Callers on RX path are all in BH context, disabling preemption is not sufficient to prevent BH interruption. In production, we observed strange packet drops because of the race condition between LWT xmit and TC ingress, and we verified this issue is fixed after we disable BH. Although this bug was technically introduced from the beginning, that is commit 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure"), at that time call_rcu() had to be call_rcu_bh() to match the RCU context. So this patch may not work well before RCU flavor consolidation has been completed around v5.0. Update the comments above the code too, as call_rcu() is now BH friendly. Signed-off-by: Dongdong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Cong Wang Link: https://lore.kernel.org/bpf/20201205075946.497763-1-xiyou.wangcong@gmail.com --- net/core/lwt_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 7d3438215f32..4f3cb7c15ddf 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -39,12 +39,11 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, { int ret; - /* Preempt disable is needed to protect per-cpu redirect_info between - * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and - * access to maps strictly require a rcu_read_lock() for protection, - * mixing with BH RCU lock doesn't work. + /* Preempt disable and BH disable are needed to protect per-cpu + * redirect_info between BPF prog and skb_do_redirect(). */ preempt_disable(); + local_bh_disable(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -78,6 +77,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, break; } + local_bh_enable(); preempt_enable(); return ret; From e3366884b383073a7edc1bad9634412ae0a22d4e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 4 Dec 2020 23:59:46 -0800 Subject: [PATCH 29/81] lwt_bpf: Replace preempt_disable() with migrate_disable() migrate_disable() is just a wrapper for preempt_disable() in non-RT kernel. It is safe to replace it, and RT kernel will benefit. Note that it is introduced since Feb 2020. Suggested-by: Alexei Starovoitov Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201205075946.497763-2-xiyou.wangcong@gmail.com --- net/core/lwt_bpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 4f3cb7c15ddf..2f7940bcf715 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -39,10 +39,10 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, { int ret; - /* Preempt disable and BH disable are needed to protect per-cpu + /* Migration disable and BH disable are needed to protect per-cpu * redirect_info between BPF prog and skb_do_redirect(). */ - preempt_disable(); + migrate_disable(); local_bh_disable(); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); @@ -78,7 +78,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, } local_bh_enable(); - preempt_enable(); + migrate_enable(); return ret; } From eb96b686fc2c601e78903cc61b6cf4588ddde013 Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Fri, 4 Dec 2020 19:15:05 +0200 Subject: [PATCH 30/81] enetc: Fix reporting of h/w packet counters Noticed some inconsistencies in packet statistics reporting. This patch adds the missing Tx packet counter registers to ethtool reporting and fixes the information strings for a few of them. Fixes: 16eb4c85c964 ("enetc: Add ethtool statistics") Signed-off-by: Claudiu Manoil Link: https://lore.kernel.org/r/20201204171505.21389-1-claudiu.manoil@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 10 +++++++--- drivers/net/ethernet/freescale/enetc/enetc_hw.h | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 8ed1ebd5a183..89e558135432 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -143,8 +143,8 @@ static const struct { { ENETC_PM0_R255, "MAC rx 128-255 byte packets" }, { ENETC_PM0_R511, "MAC rx 256-511 byte packets" }, { ENETC_PM0_R1023, "MAC rx 512-1023 byte packets" }, - { ENETC_PM0_R1518, "MAC rx 1024-1518 byte packets" }, - { ENETC_PM0_R1519X, "MAC rx 1519 to max-octet packets" }, + { ENETC_PM0_R1522, "MAC rx 1024-1522 byte packets" }, + { ENETC_PM0_R1523X, "MAC rx 1523 to max-octet packets" }, { ENETC_PM0_ROVR, "MAC rx oversized packets" }, { ENETC_PM0_RJBR, "MAC rx jabber packets" }, { ENETC_PM0_RFRG, "MAC rx fragment packets" }, @@ -163,9 +163,13 @@ static const struct { { ENETC_PM0_TBCA, "MAC tx broadcast frames" }, { ENETC_PM0_TPKT, "MAC tx packets" }, { ENETC_PM0_TUND, "MAC tx undersized packets" }, + { ENETC_PM0_T64, "MAC tx 64 byte packets" }, { ENETC_PM0_T127, "MAC tx 65-127 byte packets" }, + { ENETC_PM0_T255, "MAC tx 128-255 byte packets" }, + { ENETC_PM0_T511, "MAC tx 256-511 byte packets" }, { ENETC_PM0_T1023, "MAC tx 512-1023 byte packets" }, - { ENETC_PM0_T1518, "MAC tx 1024-1518 byte packets" }, + { ENETC_PM0_T1522, "MAC tx 1024-1522 byte packets" }, + { ENETC_PM0_T1523X, "MAC tx 1523 to max-octet packets" }, { ENETC_PM0_TCNP, "MAC tx control packets" }, { ENETC_PM0_TDFR, "MAC tx deferred packets" }, { ENETC_PM0_TMCOL, "MAC tx multiple collisions" }, diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index eb6bbf1113c7..4cbf1667d7ff 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -267,8 +267,8 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PM0_R255 0x8180 #define ENETC_PM0_R511 0x8188 #define ENETC_PM0_R1023 0x8190 -#define ENETC_PM0_R1518 0x8198 -#define ENETC_PM0_R1519X 0x81A0 +#define ENETC_PM0_R1522 0x8198 +#define ENETC_PM0_R1523X 0x81A0 #define ENETC_PM0_ROVR 0x81A8 #define ENETC_PM0_RJBR 0x81B0 #define ENETC_PM0_RFRG 0x81B8 @@ -287,9 +287,13 @@ enum enetc_bdr_type {TX, RX}; #define ENETC_PM0_TBCA 0x8250 #define ENETC_PM0_TPKT 0x8260 #define ENETC_PM0_TUND 0x8268 +#define ENETC_PM0_T64 0x8270 #define ENETC_PM0_T127 0x8278 +#define ENETC_PM0_T255 0x8280 +#define ENETC_PM0_T511 0x8288 #define ENETC_PM0_T1023 0x8290 -#define ENETC_PM0_T1518 0x8298 +#define ENETC_PM0_T1522 0x8298 +#define ENETC_PM0_T1523X 0x82A0 #define ENETC_PM0_TCNP 0x82C0 #define ENETC_PM0_TDFR 0x82D0 #define ENETC_PM0_TMCOL 0x82D8 From 851d0a73c90e6c8c63fef106c6c1e73df7e05d9d Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 4 Dec 2020 18:56:28 -0500 Subject: [PATCH 31/81] bridge: Fix a deadlock when enabling multicast snooping When enabling multicast snooping, bridge module deadlocks on multicast_lock if 1) IPv6 is enabled, and 2) there is an existing querier on the same L2 network. The deadlock was caused by the following sequence: While holding the lock, br_multicast_open calls br_multicast_join_snoopers, which eventually causes IP stack to (attempt to) send out a Listener Report (in igmp6_join_group). Since the destination Ethernet address is a multicast address, br_dev_xmit feeds the packet back to the bridge via br_multicast_rcv, which in turn calls br_multicast_add_group, which then deadlocks on multicast_lock. The fix is to move the call br_multicast_join_snoopers outside of the critical section. This works since br_multicast_join_snoopers only deals with IP and does not modify any multicast data structures of the bridge, so there's no need to hold the lock. Steps to reproduce: 1. sysctl net.ipv6.conf.all.force_mld_version=1 2. have another querier 3. ip link set dev bridge type bridge mcast_snooping 0 && \ ip link set dev bridge type bridge mcast_snooping 1 < deadlock > A typical call trace looks like the following: [ 936.251495] _raw_spin_lock+0x5c/0x68 [ 936.255221] br_multicast_add_group+0x40/0x170 [bridge] [ 936.260491] br_multicast_rcv+0x7ac/0xe30 [bridge] [ 936.265322] br_dev_xmit+0x140/0x368 [bridge] [ 936.269689] dev_hard_start_xmit+0x94/0x158 [ 936.273876] __dev_queue_xmit+0x5ac/0x7f8 [ 936.277890] dev_queue_xmit+0x10/0x18 [ 936.281563] neigh_resolve_output+0xec/0x198 [ 936.285845] ip6_finish_output2+0x240/0x710 [ 936.290039] __ip6_finish_output+0x130/0x170 [ 936.294318] ip6_output+0x6c/0x1c8 [ 936.297731] NF_HOOK.constprop.0+0xd8/0xe8 [ 936.301834] igmp6_send+0x358/0x558 [ 936.305326] igmp6_join_group.part.0+0x30/0xf0 [ 936.309774] igmp6_group_added+0xfc/0x110 [ 936.313787] __ipv6_dev_mc_inc+0x1a4/0x290 [ 936.317885] ipv6_dev_mc_inc+0x10/0x18 [ 936.321677] br_multicast_open+0xbc/0x110 [bridge] [ 936.326506] br_multicast_toggle+0xec/0x140 [bridge] Fixes: 4effd28c1245 ("bridge: join all-snoopers multicast address") Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20201204235628.50653-1-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- net/bridge/br_device.c | 6 ++++++ net/bridge/br_multicast.c | 34 +++++++++++++++++++++++++--------- net/bridge/br_private.h | 10 ++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 7730c8f3cb53..d3ea9d0779fb 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -177,6 +177,9 @@ static int br_dev_open(struct net_device *dev) br_stp_enable_bridge(br); br_multicast_open(br); + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + return 0; } @@ -197,6 +200,9 @@ static int br_dev_stop(struct net_device *dev) br_stp_disable_bridge(br); br_multicast_stop(br); + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_leave_snoopers(br); + netif_stop_queue(dev); return 0; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index eae898c3cff7..54cb82a69056 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -3286,7 +3286,7 @@ static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) } #endif -static void br_multicast_join_snoopers(struct net_bridge *br) +void br_multicast_join_snoopers(struct net_bridge *br) { br_ip4_multicast_join_snoopers(br); br_ip6_multicast_join_snoopers(br); @@ -3317,7 +3317,7 @@ static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) } #endif -static void br_multicast_leave_snoopers(struct net_bridge *br) +void br_multicast_leave_snoopers(struct net_bridge *br) { br_ip4_multicast_leave_snoopers(br); br_ip6_multicast_leave_snoopers(br); @@ -3336,9 +3336,6 @@ static void __br_multicast_open(struct net_bridge *br, void br_multicast_open(struct net_bridge *br) { - if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) - br_multicast_join_snoopers(br); - __br_multicast_open(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open(br, &br->ip6_own_query); @@ -3354,9 +3351,6 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif - - if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) - br_multicast_leave_snoopers(br); } void br_multicast_dev_del(struct net_bridge *br) @@ -3487,6 +3481,7 @@ static void br_multicast_start_querier(struct net_bridge *br, int br_multicast_toggle(struct net_bridge *br, unsigned long val) { struct net_bridge_port *port; + bool change_snoopers = false; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) @@ -3495,7 +3490,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) br_mc_disabled_update(br->dev, val); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { - br_multicast_leave_snoopers(br); + change_snoopers = true; goto unlock; } @@ -3506,9 +3501,30 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) list_for_each_entry(port, &br->port_list, list) __br_multicast_enable_port(port); + change_snoopers = true; + unlock: spin_unlock_bh(&br->multicast_lock); + /* br_multicast_join_snoopers has the potential to cause + * an MLD Report/Leave to be delivered to br_multicast_rcv, + * which would in turn call br_multicast_add_group, which would + * attempt to acquire multicast_lock. This function should be + * called after the lock has been released to avoid deadlocks on + * multicast_lock. + * + * br_multicast_leave_snoopers does not have the problem since + * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and + * returns without calling br_multicast_ipv4/6_rcv if it's not + * enabled. Moved both functions out just for symmetry. + */ + if (change_snoopers) { + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + else + br_multicast_leave_snoopers(br); + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 345118e35c42..8424464186a6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -792,6 +792,8 @@ void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); +void br_multicast_join_snoopers(struct net_bridge *br); +void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); @@ -969,6 +971,14 @@ static inline void br_multicast_init(struct net_bridge *br) { } +static inline void br_multicast_join_snoopers(struct net_bridge *br) +{ +} + +static inline void br_multicast_leave_snoopers(struct net_bridge *br) +{ +} + static inline void br_multicast_open(struct net_bridge *br) { } From f55628b3e7648198e9c072b52080c5dea8678adf Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Sat, 5 Dec 2020 15:56:33 +0800 Subject: [PATCH 32/81] mptcp: print new line in mptcp_seq_show() if mptcp isn't in use When do cat /proc/net/netstat, the output isn't append with a new line, it looks like this: [root@localhost ~]# cat /proc/net/netstat ... MPTcpExt: 0 0 0 0 0 0 0 0 0 0 0 0 0[root@localhost ~]# This is because in mptcp_seq_show(), if mptcp isn't in use, net->mib.mptcp_statistics is NULL, so it just puts all 0 after "MPTcpExt:", and return, forgot the '\n'. After this patch: [root@localhost ~]# cat /proc/net/netstat ... MPTcpExt: 0 0 0 0 0 0 0 0 0 0 0 0 0 [root@localhost ~]# Fixes: fc518953bc9c8d7d ("mptcp: add and use MIB counter infrastructure") Signed-off-by: Jianguo Wu Acked-by: Florian Westphal Link: https://lore.kernel.org/r/142e2fd9-58d9-bb13-fb75-951cccc2331e@163.com Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 84d119436b22..b921cbdd9aaa 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -67,6 +67,7 @@ void mptcp_seq_show(struct seq_file *seq) for (i = 0; mptcp_snmp_list[i].name; i++) seq_puts(seq, " 0"); + seq_putc(seq, '\n'); return; } From bbef72c630b522a9ffbf62dae19b59c880da6ea1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 6 Dec 2020 16:13:39 +0100 Subject: [PATCH 33/81] dpaa2-mac: Add a missing of_node_put after of_device_is_available Add an 'of_node_put()' call when a tested device node is not available. Fixes: 94ae899b2096 ("dpaa2-mac: add PCS support through the Lynx module") Signed-off-by: Christophe JAILLET Reviewed-by: Ioana Ciornei Link: https://lore.kernel.org/r/20201206151339.44306-1-christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c index 90cd243070d7..828c177df03d 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-mac.c @@ -269,6 +269,7 @@ static int dpaa2_pcs_create(struct dpaa2_mac *mac, if (!of_device_is_available(node)) { netdev_err(mac->net_dev, "pcs-handle node not available\n"); + of_node_put(node); return -ENODEV; } From 82ca4c922b8992013a238d65cf4e60cc33e12f36 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 5 Dec 2020 22:32:07 +0100 Subject: [PATCH 34/81] net: stmmac: dwmac-meson8b: fix mask definition of the m250_sel mux The m250_sel mux clock uses bit 4 in the PRG_ETH0 register. Fix this by shifting the PRG_ETH0_CLK_M250_SEL_MASK accordingly as the "mask" in struct clk_mux expects the mask relative to the "shift" field in the same struct. While here, get rid of the PRG_ETH0_CLK_M250_SEL_SHIFT macro and use __ffs() to determine it from the existing PRG_ETH0_CLK_M250_SEL_MASK macro. Fixes: 566e8251625304 ("net: stmmac: add a glue driver for the Amlogic Meson 8b / GXBB DWMAC") Signed-off-by: Martin Blumenstingl Reviewed-by: Jerome Brunet Link: https://lore.kernel.org/r/20201205213207.519341-1-martin.blumenstingl@googlemail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 5afcf05bbf9c..6d6bd77bb6af 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -30,7 +30,6 @@ #define PRG_ETH0_EXT_RMII_MODE 4 /* mux to choose between fclk_div2 (bit unset) and mpll2 (bit set) */ -#define PRG_ETH0_CLK_M250_SEL_SHIFT 4 #define PRG_ETH0_CLK_M250_SEL_MASK GENMASK(4, 4) /* TX clock delay in ns = "8ns / 4 * tx_dly_val" (where 8ns are exactly one @@ -155,8 +154,9 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) return -ENOMEM; clk_configs->m250_mux.reg = dwmac->regs + PRG_ETH0; - clk_configs->m250_mux.shift = PRG_ETH0_CLK_M250_SEL_SHIFT; - clk_configs->m250_mux.mask = PRG_ETH0_CLK_M250_SEL_MASK; + clk_configs->m250_mux.shift = __ffs(PRG_ETH0_CLK_M250_SEL_MASK); + clk_configs->m250_mux.mask = PRG_ETH0_CLK_M250_SEL_MASK >> + clk_configs->m250_mux.shift; clk = meson8b_dwmac_register_clk(dwmac, "m250_sel", mux_parents, ARRAY_SIZE(mux_parents), &clk_mux_ops, &clk_configs->m250_mux.hw); From cc00bcaa589914096edef7fb87ca5cee4a166b5c Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 25 Nov 2020 11:27:22 -0700 Subject: [PATCH 35/81] netfilter: x_tables: Switch synchronization to RCU When running concurrent iptables rules replacement with data, the per CPU sequence count is checked after the assignment of the new information. The sequence count is used to synchronize with the packet path without the use of any explicit locking. If there are any packets in the packet path using the table information, the sequence count is incremented to an odd value and is incremented to an even after the packet process completion. The new table value assignment is followed by a write memory barrier so every CPU should see the latest value. If the packet path has started with the old table information, the sequence counter will be odd and the iptables replacement will wait till the sequence count is even prior to freeing the old table info. However, this assumes that the new table information assignment and the memory barrier is actually executed prior to the counter check in the replacement thread. If CPU decides to execute the assignment later as there is no user of the table information prior to the sequence check, the packet path in another CPU may use the old table information. The replacement thread would then free the table information under it leading to a use after free in the packet processing context- Unable to handle kernel NULL pointer dereference at virtual address 000000000000008e pc : ip6t_do_table+0x5d0/0x89c lr : ip6t_do_table+0x5b8/0x89c ip6t_do_table+0x5d0/0x89c ip6table_filter_hook+0x24/0x30 nf_hook_slow+0x84/0x120 ip6_input+0x74/0xe0 ip6_rcv_finish+0x7c/0x128 ipv6_rcv+0xac/0xe4 __netif_receive_skb+0x84/0x17c process_backlog+0x15c/0x1b8 napi_poll+0x88/0x284 net_rx_action+0xbc/0x23c __do_softirq+0x20c/0x48c This could be fixed by forcing instruction order after the new table information assignment or by switching to RCU for the synchronization. Fixes: 80055dab5de0 ("netfilter: x_tables: make xt_replace_table wait until old rules are not used anymore") Reported-by: Sean Tranchetti Reported-by: kernel test robot Suggested-by: Florian Westphal Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 5 ++- net/ipv4/netfilter/arp_tables.c | 14 ++++----- net/ipv4/netfilter/ip_tables.c | 14 ++++----- net/ipv6/netfilter/ip6_tables.c | 14 ++++----- net/netfilter/x_tables.c | 49 +++++++++--------------------- 5 files changed, 40 insertions(+), 56 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5deb099d156d..8ebb64193757 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -227,7 +227,7 @@ struct xt_table { unsigned int valid_hooks; /* Man behind the curtain... */ - struct xt_table_info *private; + struct xt_table_info __rcu *private; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; @@ -448,6 +448,9 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); +struct xt_table_info +*xt_table_get_private_protected(const struct xt_table *table); + #ifdef CONFIG_COMPAT #include diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170..563b62b76a5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d7301..6e2851f8d3a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 2e2119bfcf13..c4f532f4d311 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -280,7 +280,7 @@ ip6t_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; @@ -807,7 +807,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -831,7 +831,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -980,7 +980,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1035,7 +1035,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - struct xt_table_info *private = t->private; + struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1189,7 +1189,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1552,7 +1552,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index af22dbe85e2c..acce622582e3 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1349,6 +1349,14 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); +struct xt_table_info +*xt_table_get_private_protected(const struct xt_table *table) +{ + return rcu_dereference_protected(table->private, + mutex_is_locked(&xt[table->af].mutex)); +} +EXPORT_SYMBOL(xt_table_get_private_protected); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, @@ -1356,7 +1364,6 @@ xt_replace_table(struct xt_table *table, int *error) { struct xt_table_info *private; - unsigned int cpu; int ret; ret = xt_jumpstack_alloc(newinfo); @@ -1366,47 +1373,20 @@ xt_replace_table(struct xt_table *table, } /* Do the substitution. */ - local_bh_disable(); - private = table->private; + private = xt_table_get_private_protected(table); /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { pr_debug("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - local_bh_enable(); *error = -EAGAIN; return NULL; } newinfo->initial_entries = private->initial_entries; - /* - * Ensure contents of newinfo are visible before assigning to - * private. - */ - smp_wmb(); - table->private = newinfo; - /* make sure all cpus see new ->private value */ - smp_wmb(); - - /* - * Even though table entries have now been swapped, other CPU's - * may still be using the old entries... - */ - local_bh_enable(); - - /* ... so wait for even xt_recseq on all cpus */ - for_each_possible_cpu(cpu) { - seqcount_t *s = &per_cpu(xt_recseq, cpu); - u32 seq = raw_read_seqcount(s); - - if (seq & 1) { - do { - cond_resched(); - cpu_relax(); - } while (seq == raw_read_seqcount(s)); - } - } + rcu_assign_pointer(table->private, newinfo); + synchronize_rcu(); audit_log_nfcfg(table->name, table->af, private->number, !private->number ? AUDIT_XT_OP_REGISTER : @@ -1442,12 +1422,12 @@ struct xt_table *xt_register_table(struct net *net, } /* Simplifies replace_table code. */ - table->private = bootstrap; + rcu_assign_pointer(table->private, bootstrap); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; - private = table->private; + private = xt_table_get_private_protected(table); pr_debug("table->private->number = %u\n", private->number); /* save number of initial entries */ @@ -1470,7 +1450,8 @@ void *xt_unregister_table(struct xt_table *table) struct xt_table_info *private; mutex_lock(&xt[table->af].mutex); - private = table->private; + private = xt_table_get_private_protected(table); + RCU_INIT_POINTER(table->private, NULL); list_del(&table->list); mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, From 932c60558109a9131e54dacfda6070147fd1cdfb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 4 Dec 2020 15:20:01 -0800 Subject: [PATCH 36/81] tools/bpftool: Fix PID fetching with a lot of results In case of having so many PID results that they don't fit into a singe page (4096) bytes, bpftool will erroneously conclude that it got corrupted data due to 4096 not being a multiple of struct pid_iter_entry, so the last entry will be partially truncated. Fix this by sizing the buffer to fit exactly N entries with no truncation in the middle of record. Fixes: d53dee3fe013 ("tools/bpftool: Show info for processes holding BPF map/prog/link/btf FDs") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201204232002.3589803-1-andrii@kernel.org --- tools/bpf/bpftool/pids.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index df7d8ec76036..477e55d59c34 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -89,9 +89,9 @@ libbpf_print_none(__maybe_unused enum libbpf_print_level level, int build_obj_refs_table(struct obj_refs_table *table, enum bpf_obj_type type) { - char buf[4096]; - struct pid_iter_bpf *skel; struct pid_iter_entry *e; + char buf[4096 / sizeof(*e) * sizeof(*e)]; + struct pid_iter_bpf *skel; int err, ret, fd = -1, i; libbpf_print_fn_t default_print; From 007ab5345545aba2f9cbe4c096cc35d2fd3275ac Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Sat, 5 Dec 2020 12:22:29 -0500 Subject: [PATCH 37/81] bonding: fix feature flag setting at init time Don't try to adjust XFRM support flags if the bond device isn't yet registered. Bad things can currently happen when netdev_change_features() is called without having wanted_features fully filled in yet. This code runs both on post-module-load mode changes, as well as at module init time, and when run at module init time, it is before register_netdevice() has been called and filled in wanted_features. The empty wanted_features led to features also getting emptied out, which was definitely not the intended behavior, so prevent that from happening. Originally, I'd hoped to stop adjusting wanted_features at all in the bonding driver, as it's documented as being something only the network core should touch, but we actually do need to do this to properly update both the features and wanted_features fields when changing the bond type, or we get to a situation where ethtool sees: esp-hw-offload: off [requested on] I do think we should be using netdev_update_features instead of netdev_change_features here though, so we only send notifiers when the features actually changed. Fixes: a3b658cfb664 ("bonding: allow xfrm offload setup post-module-load") Reported-by: Ivan Vecera Suggested-by: Ivan Vecera Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Jarod Wilson Link: https://lore.kernel.org/r/20201205172229.576587-1-jarod@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_options.c | 22 +++++++++++++++------- include/net/bonding.h | 2 -- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 9abfaae1c6f7..a4e4e15f574d 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -745,6 +745,19 @@ const struct bond_option *bond_opt_get(unsigned int option) return &bond_opts[option]; } +static void bond_set_xfrm_features(struct net_device *bond_dev, u64 mode) +{ + if (!IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + return; + + if (mode == BOND_MODE_ACTIVEBACKUP) + bond_dev->wanted_features |= BOND_XFRM_FEATURES; + else + bond_dev->wanted_features &= ~BOND_XFRM_FEATURES; + + netdev_update_features(bond_dev); +} + static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { @@ -767,13 +780,8 @@ static int bond_option_mode_set(struct bonding *bond, if (newval->value == BOND_MODE_ALB) bond->params.tlb_dynamic_lb = 1; -#ifdef CONFIG_XFRM_OFFLOAD - if (newval->value == BOND_MODE_ACTIVEBACKUP) - bond->dev->wanted_features |= BOND_XFRM_FEATURES; - else - bond->dev->wanted_features &= ~BOND_XFRM_FEATURES; - netdev_change_features(bond->dev); -#endif /* CONFIG_XFRM_OFFLOAD */ + if (bond->dev->reg_state == NETREG_REGISTERED) + bond_set_xfrm_features(bond->dev, newval->value); /* don't cache arp_validate between modes */ bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; diff --git a/include/net/bonding.h b/include/net/bonding.h index d9d0ff3b0ad3..adc3da776970 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -86,10 +86,8 @@ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) -#ifdef CONFIG_XFRM_OFFLOAD #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) -#endif /* CONFIG_XFRM_OFFLOAD */ #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; From 917d80d376ffbaa9725fde9e3c0282f63643f278 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 8 Dec 2020 18:25:53 +0100 Subject: [PATCH 38/81] netfilter: nft_dynset: fix timeouts later than 23 days Use nf_msecs_to_jiffies64 and nf_jiffies64_to_msecs as provided by 8e1102d5a159 ("netfilter: nf_tables: support timeouts larger than 23 days"), otherwise ruleset listing breaks. Fixes: a8b1e36d0d1d ("netfilter: nft_dynset: fix element timeout for HZ != 1000") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 4 ++-- net/netfilter/nft_dynset.c | 8 +++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 55b4cadf290a..c1c0a4ff92ae 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1524,4 +1524,8 @@ void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); + +int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); +__be64 nf_jiffies64_to_msecs(u64 input); + #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 23abf1578594..c2f59879a48d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3719,7 +3719,7 @@ cont: return 0; } -static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) +int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) { u64 ms = be64_to_cpu(nla_get_be64(nla)); u64 max = (u64)(~((u64)0)); @@ -3733,7 +3733,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return 0; } -static __be64 nf_jiffies64_to_msecs(u64 input) +__be64 nf_jiffies64_to_msecs(u64 input) { return cpu_to_be64(jiffies64_to_msecs(input)); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 64ca13a1885b..9af4f93c7f0e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -157,8 +157,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - tb[NFTA_DYNSET_TIMEOUT]))); + + err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); + if (err) + return err; } priv->sreg_key = nft_parse_register(tb[NFTA_DYNSET_SREG_KEY]); @@ -267,7 +269,7 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, - cpu_to_be64(jiffies_to_msecs(priv->timeout)), + nf_jiffies64_to_msecs(priv->timeout), NFTA_DYNSET_PAD)) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) From 42f1c27120906a54e73101a7d6a12f58813f6a9f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 8 Dec 2020 18:57:02 +0100 Subject: [PATCH 39/81] netfilter: nftables: comment indirect serialization of commit_mutex with rtnl_mutex Add an explicit comment in the code to describe the indirect serialization of the holders of the commit_mutex with the rtnl_mutex. Commit 90d2723c6d4c ("netfilter: nf_tables: do not hold reference on netdevice from preparation phase") already describes this, but a comment in this case is better for reference. Reported-by: Vladimir Oltean Reviewed-by: Vladimir Oltean Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c2f59879a48d..9a080767667b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1723,6 +1723,10 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, } nla_strlcpy(ifname, attr, IFNAMSIZ); + /* nf_tables_netdev_event() is called under rtnl_mutex, this is + * indirectly serializing all the other holders of the commit_mutex with + * the rtnl_mutex. + */ dev = __dev_get_by_name(net, ifname); if (!dev) { err = -ENOENT; From 9d14edfdeabf37d8d8f045e63e5873712aadcd6b Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:37 +0800 Subject: [PATCH 40/81] net: stmmac: increase the timeout for dma reset Current timeout value is not enough for gmac5 dma reset on imx8mp platform, increase the timeout range. Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 6e30d7eb4983..0b4ee2dbb691 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -22,7 +22,7 @@ int dwmac4_dma_reset(void __iomem *ioaddr) return readl_poll_timeout(ioaddr + DMA_BUS_MODE, value, !(value & DMA_BUS_MODE_SFT_RESET), - 10000, 100000); + 10000, 1000000); } void dwmac4_set_rx_tail_ptr(void __iomem *ioaddr, u32 tail_ptr, u32 chan) From 36d18b5664ef617ccf4da266291d2f2342fab89d Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:38 +0800 Subject: [PATCH 41/81] net: stmmac: start phylink instance before stmmac_hw_setup() Start phylink instance and resume back the PHY to supply RX clock to MAC before MAC layer initialization by calling .stmmac_hw_setup(), since DMA reset depends on the RX clock, otherwise DMA reset cost maximum timeout value then finally timeout. Fixes: 74371272f97f ("net: stmmac: Convert to phylink and remove phylib logic") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ba45fe237512..0cef414f1289 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5247,6 +5247,14 @@ int stmmac_resume(struct device *dev) return ret; } + if (!device_may_wakeup(priv->device) || !priv->plat->pmt) { + rtnl_lock(); + phylink_start(priv->phylink); + /* We may have called phylink_speed_down before */ + phylink_speed_up(priv->phylink); + rtnl_unlock(); + } + rtnl_lock(); mutex_lock(&priv->lock); @@ -5265,14 +5273,6 @@ int stmmac_resume(struct device *dev) mutex_unlock(&priv->lock); rtnl_unlock(); - if (!device_may_wakeup(priv->device) || !priv->plat->pmt) { - rtnl_lock(); - phylink_start(priv->phylink); - /* We may have called phylink_speed_down before */ - phylink_speed_up(priv->phylink); - rtnl_unlock(); - } - phylink_mac_change(priv->phylink, true); netif_device_attach(ndev); From 4ec236c7c51f89abb0224a4da4a6b77f9beb6600 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:39 +0800 Subject: [PATCH 42/81] net: stmmac: free tx skb buffer in stmmac_resume() When do suspend/resume test, there have WARN_ON() log dump from stmmac_xmit() funciton, the code logic: entry = tx_q->cur_tx; first_entry = entry; WARN_ON(tx_q->tx_skbuff[first_entry]); In normal case, tx_q->tx_skbuff[txq->cur_tx] should be NULL because the skb should be handled and freed in stmmac_tx_clean(). But stmmac_resume() reset queue parameters like below, skb buffers may not be freed. tx_q->cur_tx = 0; tx_q->dirty_tx = 0; So free tx skb buffer in stmmac_resume() to avoid warning and memory leak. log: [ 46.139824] ------------[ cut here ]------------ [ 46.144453] WARNING: CPU: 0 PID: 0 at drivers/net/ethernet/stmicro/stmmac/stmmac_main.c:3235 stmmac_xmit+0x7a0/0x9d0 [ 46.154969] Modules linked in: crct10dif_ce vvcam(O) flexcan can_dev [ 46.161328] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O 5.4.24-2.1.0+g2ad925d15481 #1 [ 46.170369] Hardware name: NXP i.MX8MPlus EVK board (DT) [ 46.175677] pstate: 80000005 (Nzcv daif -PAN -UAO) [ 46.180465] pc : stmmac_xmit+0x7a0/0x9d0 [ 46.184387] lr : dev_hard_start_xmit+0x94/0x158 [ 46.188913] sp : ffff800010003cc0 [ 46.192224] x29: ffff800010003cc0 x28: ffff000177e2a100 [ 46.197533] x27: ffff000176ef0840 x26: ffff000176ef0090 [ 46.202842] x25: 0000000000000000 x24: 0000000000000000 [ 46.208151] x23: 0000000000000003 x22: ffff8000119ddd30 [ 46.213460] x21: ffff00017636f000 x20: ffff000176ef0cc0 [ 46.218769] x19: 0000000000000003 x18: 0000000000000000 [ 46.224078] x17: 0000000000000000 x16: 0000000000000000 [ 46.229386] x15: 0000000000000079 x14: 0000000000000000 [ 46.234695] x13: 0000000000000003 x12: 0000000000000003 [ 46.240003] x11: 0000000000000010 x10: 0000000000000010 [ 46.245312] x9 : ffff00017002b140 x8 : 0000000000000000 [ 46.250621] x7 : ffff00017636f000 x6 : 0000000000000010 [ 46.255930] x5 : 0000000000000001 x4 : ffff000176ef0000 [ 46.261238] x3 : 0000000000000003 x2 : 00000000ffffffff [ 46.266547] x1 : ffff000177e2a000 x0 : 0000000000000000 [ 46.271856] Call trace: [ 46.274302] stmmac_xmit+0x7a0/0x9d0 [ 46.277874] dev_hard_start_xmit+0x94/0x158 [ 46.282056] sch_direct_xmit+0x11c/0x338 [ 46.285976] __qdisc_run+0x118/0x5f0 [ 46.289549] net_tx_action+0x110/0x198 [ 46.293297] __do_softirq+0x120/0x23c [ 46.296958] irq_exit+0xb8/0xd8 [ 46.300098] __handle_domain_irq+0x64/0xb8 [ 46.304191] gic_handle_irq+0x5c/0x148 [ 46.307936] el1_irq+0xb8/0x180 [ 46.311076] cpuidle_enter_state+0x84/0x360 [ 46.315256] cpuidle_enter+0x34/0x48 [ 46.318829] call_cpuidle+0x18/0x38 [ 46.322314] do_idle+0x1e0/0x280 [ 46.325539] cpu_startup_entry+0x24/0x40 [ 46.329460] rest_init+0xd4/0xe0 [ 46.332687] arch_call_rest_init+0xc/0x14 [ 46.336695] start_kernel+0x420/0x44c [ 46.340353] ---[ end trace bc1ee695123cbacd ]--- Fixes: 47dd7a540b8a0 ("net: add support for STMicroelectronics Ethernet controllers.") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0cef414f1289..7452f3c1cab9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1533,6 +1533,19 @@ static void dma_free_tx_skbufs(struct stmmac_priv *priv, u32 queue) stmmac_free_tx_buffer(priv, queue, i); } +/** + * stmmac_free_tx_skbufs - free TX skb buffers + * @priv: private structure + */ +static void stmmac_free_tx_skbufs(struct stmmac_priv *priv) +{ + u32 tx_queue_cnt = priv->plat->tx_queues_to_use; + u32 queue; + + for (queue = 0; queue < tx_queue_cnt; queue++) + dma_free_tx_skbufs(priv, queue); +} + /** * free_dma_rx_desc_resources - free RX dma desc resources * @priv: private structure @@ -5260,6 +5273,7 @@ int stmmac_resume(struct device *dev) stmmac_reset_queues_param(priv); + stmmac_free_tx_skbufs(priv); stmmac_clear_descriptors(priv); stmmac_hw_setup(ndev, false); From 5f58591323bf3f342920179f24515935c4b5fd60 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:40 +0800 Subject: [PATCH 43/81] net: stmmac: delete the eee_ctrl_timer after napi disabled There have chance to re-enable the eee_ctrl_timer and fire the timer in napi callback after delete the timer in .stmmac_release(), which introduces to access eee registers in the timer function after clocks are disabled then causes system hang. Found this issue when do suspend/resume and reboot stress test. It is safe to delete the timer after napi disabled and disable lpi mode. Fixes: d765955d2ae0b ("stmmac: add the Energy Efficient Ethernet support") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 7452f3c1cab9..d2521ebb8217 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2908,9 +2908,6 @@ static int stmmac_release(struct net_device *dev) struct stmmac_priv *priv = netdev_priv(dev); u32 chan; - if (priv->eee_enabled) - del_timer_sync(&priv->eee_ctrl_timer); - if (device_may_wakeup(priv->device)) phylink_speed_down(priv->phylink, false); /* Stop and disconnect the PHY */ @@ -2929,6 +2926,11 @@ static int stmmac_release(struct net_device *dev) if (priv->lpi_irq > 0) free_irq(priv->lpi_irq, dev); + if (priv->eee_enabled) { + priv->tx_path_in_lpi_mode = false; + del_timer_sync(&priv->eee_ctrl_timer); + } + /* Stop TX/RX DMA and clear the descriptors */ stmmac_stop_all_dma(priv); @@ -5155,6 +5157,11 @@ int stmmac_suspend(struct device *dev) for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) del_timer_sync(&priv->tx_queue[chan].txtimer); + if (priv->eee_enabled) { + priv->tx_path_in_lpi_mode = false; + del_timer_sync(&priv->eee_ctrl_timer); + } + /* Stop TX/RX DMA */ stmmac_stop_all_dma(priv); From f119cc9818eb33b66e977ad3af75aef6500bbdc3 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 7 Dec 2020 18:51:41 +0800 Subject: [PATCH 44/81] net: stmmac: overwrite the dma_cap.addr64 according to HW design The current IP register MAC_HW_Feature1[ADDR64] only defines 32/40/64 bit width, but some SOCs support others like i.MX8MP support 34 bits but it maps to 40 bits width in MAC_HW_Feature1[ADDR64]. So overwrite dma_cap.addr64 according to HW real design. Fixes: 94abdad6974a ("net: ethernet: dwmac: add ethernet glue logic for NXP imx8 chip") Signed-off-by: Fugang Duan Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 9 +-------- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 8 ++++++++ include/linux/stmmac.h | 1 + 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index efef5476a577..223f69da7e95 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -246,13 +246,7 @@ static int imx_dwmac_probe(struct platform_device *pdev) goto err_parse_dt; } - ret = dma_set_mask_and_coherent(&pdev->dev, - DMA_BIT_MASK(dwmac->ops->addr_width)); - if (ret) { - dev_err(&pdev->dev, "DMA mask set failed\n"); - goto err_dma_mask; - } - + plat_dat->addr64 = dwmac->ops->addr_width; plat_dat->init = imx_dwmac_init; plat_dat->exit = imx_dwmac_exit; plat_dat->fix_mac_speed = imx_dwmac_fix_speed; @@ -272,7 +266,6 @@ static int imx_dwmac_probe(struct platform_device *pdev) err_dwmac_init: err_drv_probe: imx_dwmac_exit(pdev, plat_dat->bsp_priv); -err_dma_mask: err_parse_dt: err_match_data: stmmac_remove_config_dt(pdev, plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d2521ebb8217..c33db79cdd0a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4945,6 +4945,14 @@ int stmmac_dvr_probe(struct device *device, dev_info(priv->device, "SPH feature enabled\n"); } + /* The current IP register MAC_HW_Feature1[ADDR64] only define + * 32/40/64 bit width, but some SOC support others like i.MX8MP + * support 34 bits but it map to 40 bits width in MAC_HW_Feature1[ADDR64]. + * So overwrite dma_cap.addr64 according to HW real design. + */ + if (priv->plat->addr64) + priv->dma_cap.addr64 = priv->plat->addr64; + if (priv->dma_cap.addr64) { ret = dma_set_mask_and_coherent(device, DMA_BIT_MASK(priv->dma_cap.addr64)); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 628e28903b8b..15ca6b4167cc 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -170,6 +170,7 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; + u32 addr64; u32 rx_queues_to_use; u32 tx_queues_to_use; u8 rx_sched_algorithm; From 0398ba9e5a4b5675aa571e0445689d3c2e499c2d Mon Sep 17 00:00:00 2001 From: Cengiz Can Date: Mon, 7 Dec 2020 11:14:24 +0300 Subject: [PATCH 45/81] net: tipc: prevent possible null deref of link `tipc_node_apply_property` does a null check on a `tipc_link_entry` pointer but also accesses the same pointer out of the null check block. This triggers a warning on Coverity Static Analyzer because we're implying that `e->link` can BE null. Move "Update MTU for node link entry" line into if block to make sure that we're not in a state that `e->link` is null. Signed-off-by: Cengiz Can Signed-off-by: David S. Miller --- net/tipc/node.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index c95d037fde51..83978d5dae59 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2181,9 +2181,11 @@ void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); + + /* Update MTU for node link entry */ + e->mtu = tipc_link_mss(e->link); } - /* Update MTU for node link entry */ - e->mtu = tipc_link_mss(e->link); + tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } From cc6596fc7295e9dcd78156ed42f9f8e1221f7530 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Tue, 8 Dec 2020 09:53:42 +0800 Subject: [PATCH 46/81] net: ll_temac: Fix potential NULL dereference in temac_probe() platform_get_resource() may fail and in this case a NULL dereference will occur. Fix it to use devm_platform_ioremap_resource() instead of calling platform_get_resource() and devm_ioremap(). This is detected by Coccinelle semantic patch. @@ expression pdev, res, n, t, e, e1, e2; @@ res = \(platform_get_resource\|platform_get_resource_byname\)(pdev, t, n); + if (!res) + return -EINVAL; ... when != res == NULL e = devm_ioremap(e1, res->start, e2); Fixes: 8425c41d1ef7 ("net: ll_temac: Extend support to non-device-tree platforms") Signed-off-by: Zhang Changzhong Acked-by: Esben Haabendal Signed-off-by: David S. Miller --- drivers/net/ethernet/xilinx/ll_temac_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 60c199fcb91e..030185301014 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1351,7 +1351,6 @@ static int temac_probe(struct platform_device *pdev) struct device_node *temac_np = dev_of_node(&pdev->dev), *dma_np; struct temac_local *lp; struct net_device *ndev; - struct resource *res; const void *addr; __be32 *p; bool little_endian; @@ -1500,13 +1499,11 @@ static int temac_probe(struct platform_device *pdev) of_node_put(dma_np); } else if (pdata) { /* 2nd memory resource specifies DMA registers */ - res = platform_get_resource(pdev, IORESOURCE_MEM, 1); - lp->sdma_regs = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!lp->sdma_regs) { + lp->sdma_regs = devm_platform_ioremap_resource(pdev, 1); + if (IS_ERR(lp->sdma_regs)) { dev_err(&pdev->dev, "could not map DMA registers\n"); - return -ENOMEM; + return PTR_ERR(lp->sdma_regs); } if (pdata->dma_little_endian) { lp->dma_in = temac_dma_in32_le; From 72d05c00d7ecda85df29abd046da7e41cc071c17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2020 08:21:31 -0800 Subject: [PATCH 47/81] tcp: select sane initial rcvq_space.space for big MSS Before commit a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") small tcp_rmem[1] values were overridden by tcp_fixup_rcvbuf() to accommodate various MSS. This is no longer the case, and Hazem Mohamed Abuelfotoh reported that DRS would not work for MTU 9000 endpoints receiving regular (1500 bytes) frames. Root cause is that tcp_init_buffer_space() uses tp->rcv_wnd for upper limit of rcvq_space.space computation, while it can select later a smaller value for tp->rcv_ssthresh and tp->window_clamp. ss -temoi on receiver would show : skmem:(r0,rb131072,t0,tb46080,f0,w0,o0,bl0,d0) rcv_space:62496 rcv_ssthresh:56596 This means that TCP can not increase its window in tcp_grow_window(), and that DRS can never kick. Fix this by making sure that rcvq_space.space is not bigger than number of bytes that can be held in TCP receive queue. People unable/unwilling to change their kernel can work around this issue by selecting a bigger tcp_rmem[1] value as in : echo "4096 196608 6291456" >/proc/sys/net/ipv4/tcp_rmem Based on an initial report and patch from Hazem Mohamed Abuelfotoh https://lore.kernel.org/netdev/20201204180622.14285-1-abuehaze@amazon.com/ Fixes: a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") Fixes: 041a14d26715 ("tcp: start receiver buffer autotuning sooner") Reported-by: Hazem Mohamed Abuelfotoh Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 389d1b340248..ef4bdb038a4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -510,7 +510,6 @@ static void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -534,6 +533,8 @@ static void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; + tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, + (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ From b62527005d46d52b4733cbc57f2f9b514b673ed9 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 8 Dec 2020 22:49:00 +0100 Subject: [PATCH 48/81] bpf, doc: Update KP's email in MAINTAINERS Helps me use a single account to sign off and send patches use appropriate email redirection without needing to update MAINTAINERS. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201208214900.80684-1-kpsingh@kernel.org --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 98f0bd050ff5..ba63f61e4ef1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3237,7 +3237,7 @@ R: Martin KaFai Lau R: Song Liu R: Yonghong Song R: John Fastabend -R: KP Singh +R: KP Singh L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Supported @@ -3356,7 +3356,7 @@ F: arch/x86/net/ X: arch/x86/net/bpf_jit_comp32.c BPF LSM (Security Audit and Enforcement using BPF) -M: KP Singh +M: KP Singh R: Florent Revest R: Brendan Jackman L: bpf@vger.kernel.org From 2d94b20b95b009eec1a267dcf026b01af627c0cd Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 8 Dec 2020 16:39:24 -0500 Subject: [PATCH 49/81] netfilter: nft_ct: Remove confirmation check for NFT_CT_ID Since commit 656c8e9cc1ba ("netfilter: conntrack: Use consistent ct id hash calculation") the ct id will not change from initialization to confirmation. Removing the confirmation check allows for things like adding an element to a 'typeof ct id' set in prerouting upon reception of the first packet of a new connection, and then being able to reference that set consistently both before and after the connection is confirmed. Fixes: 656c8e9cc1ba ("netfilter: conntrack: Use consistent ct id hash calculation") Signed-off-by: Brett Mastbergen Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 322bd674963e..a1b0aac46e9e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -177,8 +177,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, } #endif case NFT_CT_ID: - if (!nf_ct_is_confirmed(ct)) - goto err; *dest = nf_ct_get_id(ct); return; default: From 998f17296234aa8d3676b4a13962eb39f4ad24e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:37 +0100 Subject: [PATCH 50/81] xdp: Remove the xdp_attachment_flags_ok() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device"), the XDP program attachment info is now maintained in the core code. This interacts badly with the xdp_attachment_flags_ok() check that prevents unloading an XDP program with different load flags than it was loaded with. In practice, two kinds of failures are seen: - An XDP program loaded without specifying a mode (and which then ends up in driver mode) cannot be unloaded if the program mode is specified on unload. - The dev_xdp_uninstall() hook always calls the driver callback with the mode set to the type of the program but an empty flags argument, which means the flags_ok() check prevents the program from being removed, leading to bpf prog reference leaks. The original reason this check was added was to avoid ambiguity when multiple programs were loaded. With the way the checks are done in the core now, this is quite simple to enforce in the core code, so let's add a check there and get rid of the xdp_attachment_flags_ok() callback entirely. Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752225751.110217.10267659521308669050.stgit@toke.dk --- .../ethernet/netronome/nfp/nfp_net_common.c | 6 ----- drivers/net/ethernet/ti/cpsw_priv.c | 3 --- drivers/net/netdevsim/bpf.c | 3 --- include/net/xdp.h | 2 -- net/core/dev.c | 22 +++++++++++++++++-- net/core/xdp.c | 12 ---------- 6 files changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index b150da43adb2..437226866ce8 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3562,9 +3562,6 @@ static int nfp_net_xdp_setup_drv(struct nfp_net *nn, struct netdev_bpf *bpf) struct nfp_net_dp *dp; int err; - if (!xdp_attachment_flags_ok(&nn->xdp, bpf)) - return -EBUSY; - if (!prog == !nn->dp.xdp_prog) { WRITE_ONCE(nn->dp.xdp_prog, prog); xdp_attachment_setup(&nn->xdp, bpf); @@ -3593,9 +3590,6 @@ static int nfp_net_xdp_setup_hw(struct nfp_net *nn, struct netdev_bpf *bpf) { int err; - if (!xdp_attachment_flags_ok(&nn->xdp_hw, bpf)) - return -EBUSY; - err = nfp_app_xdp_offload(nn->app, nn, bpf->prog, bpf->extack); if (err) return err; diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index 31c5e36ff706..424e644724e4 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1265,9 +1265,6 @@ static int cpsw_xdp_prog_setup(struct cpsw_priv *priv, struct netdev_bpf *bpf) if (!priv->xdpi.prog && !prog) return 0; - if (!xdp_attachment_flags_ok(&priv->xdpi, bpf)) - return -EBUSY; - WRITE_ONCE(priv->xdp_prog, prog); xdp_attachment_setup(&priv->xdpi, bpf); diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 2e90512f3bbe..85546664bdd5 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -190,9 +190,6 @@ nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, { int err; - if (!xdp_attachment_flags_ok(xdp, bpf)) - return -EBUSY; - if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; diff --git a/include/net/xdp.h b/include/net/xdp.h index 3814fb631d52..9dab2bc6f187 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -240,8 +240,6 @@ struct xdp_attachment_info { }; struct netdev_bpf; -bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, - struct netdev_bpf *bpf); void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); diff --git a/net/core/dev.c b/net/core/dev.c index 8588ade790cb..38412e70f761 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8917,6 +8917,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } +static u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9007,6 +9018,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -9022,11 +9034,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } - /* just one XDP mode bit should be set, zero defaults to SKB mode */ - if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); diff --git a/net/core/xdp.c b/net/core/xdp.c index 491ad569a79c..d900cebc0acd 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -403,18 +403,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) { - NL_SET_ERR_MSG(bpf->extack, - "program loaded with different flags"); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { From 0b5b6e747c86e57b7ebd64ccb84314a227ccfcc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:38 +0100 Subject: [PATCH 51/81] selftests/bpf/test_offload.py: Remove check for program load flags match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we just removed the xdp_attachment_flags_ok() callback, also remove the check for it in test_offload.py, and replace it with a test for the new ambiguity-avoid check when multiple programs are loaded. Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752225858.110217.13036901876869496246.stgit@toke.dk --- tools/testing/selftests/bpf/test_offload.py | 22 +++++---------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 43c9cda199b8..becd27b2f4ba 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -716,13 +716,11 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): fail(ret == 0, "Replaced one of programs without -force") check_extack(err, "XDP program already attached.", args) - if modename == "" or modename == "drv": - othermode = "" if modename == "drv" else "drv" - start_test("Test multi-attachment XDP - detach...") - ret, _, err = sim.unset_xdp(othermode, force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) + start_test("Test multi-attachment XDP - remove without mode...") + ret, _, err = sim.unset_xdp("", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Removed program without a mode flag") + check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) sim.unset_xdp("offload") xdp = sim.ip_link_show(xdp=True)["xdp"] @@ -1001,16 +999,6 @@ try: check_extack(err, "native and generic XDP can't be active at the same time.", args) - ret, _, err = sim.set_xdp(obj, "", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack(err, "program loaded with different flags.", args) - - start_test("Test XDP prog remove with bad flags...") - ret, _, err = sim.unset_xdp("", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) start_test("Test MTU restrictions...") ret, _ = sim.set_mtu(9000, fail=False) From e4ff5aa469403462091eb22e2b0843b894167e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:39 +0100 Subject: [PATCH 52/81] netdevsim: Add debugfs toggle to reject BPF programs in verifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new debugfs toggle ('bpf_bind_verifier_accept') that can be used to make netdevsim reject BPF programs from being accepted by the verifier. If this toggle (which defaults to true) is set to false, nsim_bpf_verify_insn() will return EOPNOTSUPP on the last instruction (after outputting the 'Hello from netdevsim' verifier message). This makes it possible to check the verification callback in the driver from test_offload.py in selftests, since the verifier now clears the verifier log on a successful load, hiding the message from the driver. Fixes: 6f8a57ccf851 ("bpf: Make verifier log more relevant by default") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752225964.110217.12584017165318065332.stgit@toke.dk --- drivers/net/netdevsim/bpf.c | 12 ++++++++++-- drivers/net/netdevsim/netdevsim.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 85546664bdd5..90aafb56f140 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -63,15 +63,20 @@ static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; + int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); - if (insn_idx == env->prog->len - 1) + if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); - return 0; + if (!state->nsim_dev->bpf_bind_verifier_accept) + ret = -EOPNOTSUPP; + } + + return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) @@ -595,6 +600,9 @@ int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); + nsim_dev->bpf_bind_verifier_accept = true; + debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, + &nsim_dev->bpf_bind_verifier_accept); return 0; } diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 827fc80f50a0..c4e7ad2a1964 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -189,6 +189,7 @@ struct nsim_dev { struct dentry *take_snapshot; struct bpf_offload_dev *bpf_dev; bool bpf_bind_accept; + bool bpf_bind_verifier_accept; u32 bpf_bind_verifier_delay; struct dentry *ddir_bpf_bound_progs; u32 prog_id_gen; From d8b5e76ae4e02908d000397597c6bc2868362fbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:40 +0100 Subject: [PATCH 53/81] selftests/bpf/test_offload.py: Only check verifier log on verification fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6f8a57ccf851 ("bpf: Make verifier log more relevant by default"), the verifier discards log messages for successfully-verified programs. This broke test_offload.py which is looking for a verification message from the driver callback. Change test_offload.py to use the toggle in netdevsim to make the verification fail before looking for the verification message. Fixes: 6f8a57ccf851 ("bpf: Make verifier log more relevant by default") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752226069.110217.12370824996153348073.stgit@toke.dk --- tools/testing/selftests/bpf/test_offload.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index becd27b2f4ba..61527b43f067 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -911,11 +911,18 @@ try: sim.tc_flush_filters() + start_test("Test TC offloads failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter did not reject with TC offloads enabled") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + start_test("Test TC offloads work...") ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, fail=False, include_stderr=True) fail(ret != 0, "TC filter did not load with TC offloads enabled") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test TC offload basics...") dfs = simdev.dfs_get_bound_progs(expected=1) @@ -1032,6 +1039,15 @@ try: rm("/sys/fs/bpf/offload") sim.wait_for_flush() + start_test("Test XDP load failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload", + dev=sim['ifname'], fail=False, include_stderr=True) + fail(ret == 0, "verifier should fail on load") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + sim.wait_for_flush() + start_test("Test XDP offload...") _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) ipl = sim.ip_link_show(xdp=True) @@ -1039,7 +1055,6 @@ try: progs = bpftool_prog_list(expected=1) prog = progs[0] fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test XDP offload is device bound...") dfs = simdev.dfs_get_bound_progs(expected=1) From 852c2ee338f0ac6026458615b624e1c496142cf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:41 +0100 Subject: [PATCH 54/81] selftests/bpf/test_offload.py: Fix expected case of extack messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") changed the case of some of the extack messages being returned when attaching of XDP programs failed. This broke test_offload.py, so let's fix the test to reflect this. Fixes: 7f0a838254bd ("bpf, xdp: Maintain info on attached XDP BPF programs in net_device") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752226175.110217.11214100824416344952.stgit@toke.dk --- tools/testing/selftests/bpf/test_offload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 61527b43f067..51a5e4d939cc 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -1004,7 +1004,7 @@ try: fail=False, include_stderr=True) fail(ret == 0, "Replaced XDP program with a program in different mode") check_extack(err, - "native and generic XDP can't be active at the same time.", + "Native and generic XDP can't be active at the same time.", args) start_test("Test MTU restrictions...") @@ -1035,7 +1035,7 @@ try: offload = bpf_pinned("/sys/fs/bpf/offload") ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "using device-bound program without HW_MODE flag is not supported.", args) + check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args) rm("/sys/fs/bpf/offload") sim.wait_for_flush() From 766e62b7fcd2cf1d43e6594ba37c659dc48f7ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:42 +0100 Subject: [PATCH 55/81] selftests/bpf/test_offload.py: Reset ethtool features after failed setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting the ethtool feature flag fails (as expected for the test), the kernel now tracks that the feature was requested to be 'off' and refuses to subsequently disable it again. So reset it back to 'on' so a subsequent disable (that's not supposed to fail) can succeed. Fixes: 417ec26477a5 ("selftests/bpf: add offload test based on netdevsim") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752226280.110217.10696241563705667871.stgit@toke.dk --- tools/testing/selftests/bpf/test_offload.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 51a5e4d939cc..2128fbd8414b 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -946,6 +946,7 @@ try: start_test("Test disabling TC offloads is rejected while filters installed...") ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") + sim.set_ethtool_tc_offloads(True) start_test("Test qdisc removal frees things...") sim.tc_flush_filters() From 8158cad13435639cd4962fb88970960f880ef6d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Dec 2020 14:57:43 +0100 Subject: [PATCH 56/81] selftests/bpf/test_offload.py: Filter bpftool internal map when counting maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few of the tests in test_offload.py expects to see a certain number of maps created, and checks this by counting the number of maps returned by bpftool. There is already a filter that will remove any maps already there at the beginning of the test, but bpftool now creates a map for the PID iterator rodata on each invocation, which makes the map count wrong. Fix this by also filtering the pid_iter.rodata map by name when counting. Fixes: d53dee3fe013 ("tools/bpftool: Show info for processes holding BPF map/prog/link/btf FDs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/160752226387.110217.9887866138149423444.stgit@toke.dk --- tools/testing/selftests/bpf/test_offload.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 2128fbd8414b..b99bb8ed3ed4 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -184,9 +184,7 @@ def bpftool_prog_list(expected=None, ns=""): def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) # Remove the base maps - for m in base_maps: - if m in maps: - maps.remove(m) + maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names] if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % @@ -770,6 +768,9 @@ ret, progs = bpftool("prog", fail=False) skip(ret != 0, "bpftool not installed") base_progs = progs _, base_maps = bpftool("map") +base_map_names = [ + 'pid_iter.rodata' # created on each bpftool invocation +] # Check netdevsim ret, out = cmd("modprobe netdevsim", fail=False) From 323a391a220c4a234cb1e678689d7f4c3b73f863 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 4 Dec 2020 14:35:07 +0100 Subject: [PATCH 57/81] can: isotp: isotp_setsockopt(): block setsockopt on bound sockets The isotp socket can be widely configured in its behaviour regarding addressing types, fill-ups, receive pattern tests and link layer length. Usually all these settings need to be fixed before bind() and can not be changed afterwards. This patch adds a check to enforce the common usage pattern. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Signed-off-by: Oliver Hartkopp Tested-by: Thomas Wagner Link: https://lore.kernel.org/r/20201203140604.25488-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20201204133508.742120-3-mkl@pengutronix.de Signed-off-by: Jakub Kicinski --- net/can/isotp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/isotp.c b/net/can/isotp.c index d78ab13bd8be..26bdc3c20b7e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1157,6 +1157,9 @@ static int isotp_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_CAN_ISOTP) return -EINVAL; + if (so->bound) + return -EISCONN; + switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) From c02bd115b1d25931159f89c7d9bf47a30f5d4b41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Dec 2020 14:39:56 -0800 Subject: [PATCH 58/81] Revert "geneve: pull IP header before ECN decapsulation" This reverts commit 4179b00c04d1 ("geneve: pull IP header before ECN decapsulation"). Eric says: "network header should have been pulled already before hitting geneve_rx()". Let's revert the syzbot fix since it's causing more harm than good, and revisit. Suggested-by: Eric Dumazet Reported-by: Jianlin Shi Fixes: 4179b00c04d1 ("geneve: pull IP header before ECN decapsulation") Link: https://bugzilla.kernel.org/show_bug.cgi?id=210569 Link: https://lore.kernel.org/netdev/CANn89iJVWfb=2i7oU1=D55rOyQnBbbikf+Mc6XHMkY7YX-yGEw@mail.gmail.com/ Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 8ae9ce2014a4..1426bfc009bc 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -257,21 +257,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, skb_dst_set(skb, &tun_dst->dst); /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) - goto rx_error; - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - if (pskb_may_pull(skb, sizeof(struct iphdr))) - goto rx_error; - break; - case htons(ETH_P_IPV6): - if (pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto rx_error; - break; - default: - goto rx_error; + if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) { + geneve->dev->stats.rx_errors++; + goto drop; } + oiph = skb_network_header(skb); skb_reset_network_header(skb); @@ -308,8 +298,6 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, dev_sw_netstats_rx_add(geneve->dev, len); return; -rx_error: - geneve->dev->stats.rx_errors++; drop: /* Consume bad packet */ kfree_skb(skb); From cfb33e174fa25d9d830683a1e1b22850546103b5 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:48 +0100 Subject: [PATCH 59/81] igb: XDP xmit back fix error code The igb XDP xmit back function should only return defined error codes. Fixes: 9cbc948b5a20 ("igb: add XDP support") Reported-by: Dan Carpenter Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 5fc2c381da55..08cc6f59aa2e 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2910,7 +2910,7 @@ static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) */ tx_ring = adapter->xdp_prog ? igb_xdp_tx_queue_mapping(adapter) : NULL; if (unlikely(!tx_ring)) - return -ENXIO; + return IGB_XDP_CONSUMED; nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); From b829ec1a66bc3dda4b01ab4c57d41ad1a1f82fed Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:49 +0100 Subject: [PATCH 60/81] igb: take VLAN double header into account Increase the packet header padding to include double VLAN tagging. This patch uses a macro for this. Fixes: 9cbc948b5a20 ("igb: add XDP support") Suggested-by: Maciej Fijalkowski Reviewed-by: Maciej Fijalkowski Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 5 +++++ drivers/net/ethernet/intel/igb/igb_main.c | 7 +++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 0286d2fceee4..aaa954aae574 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -138,6 +138,8 @@ struct vf_mac_filter { /* this is the size past which hardware will drop packets when setting LPE=0 */ #define MAXIMUM_ETHERNET_VLAN_SIZE 1522 +#define IGB_ETH_PKT_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) + /* Supported Rx Buffer Sizes */ #define IGB_RXBUFFER_256 256 #define IGB_RXBUFFER_1536 1536 @@ -247,6 +249,9 @@ enum igb_tx_flags { #define IGB_SFF_ADDRESSING_MODE 0x4 #define IGB_SFF_8472_UNSUP 0x00 +/* TX resources are shared between XDP and netstack + * and we need to tag the buffer type to distinguish them + */ enum igb_tx_buf_type { IGB_TYPE_SKB = 0, IGB_TYPE_XDP, diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 08cc6f59aa2e..0a9198037b98 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2826,7 +2826,7 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, static int igb_xdp_setup(struct net_device *dev, struct bpf_prog *prog) { - int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + int i, frame_size = dev->mtu + IGB_ETH_PKT_HDR_PAD; struct igb_adapter *adapter = netdev_priv(dev); bool running = netif_running(dev); struct bpf_prog *old_prog; @@ -3950,8 +3950,7 @@ static int igb_sw_init(struct igb_adapter *adapter) /* set default work limits */ adapter->tx_work_limit = IGB_DEFAULT_TX_WORK; - adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN + - VLAN_HLEN; + adapter->max_frame_size = netdev->mtu + IGB_ETH_PKT_HDR_PAD; adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN; spin_lock_init(&adapter->nfc_lock); @@ -6491,7 +6490,7 @@ static void igb_get_stats64(struct net_device *netdev, static int igb_change_mtu(struct net_device *netdev, int new_mtu) { struct igb_adapter *adapter = netdev_priv(netdev); - int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; + int max_frame = new_mtu + IGB_ETH_PKT_HDR_PAD; if (adapter->xdp_prog) { int i; From 2e2bb5594ca0a5885dc93055ab0f9b5fbcdaa403 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:50 +0100 Subject: [PATCH 61/81] igb: XDP extack message on error Add an extack error message when the RX buffer size is too small for the frame size. Fixes: 9cbc948b5a20 ("igb: add XDP support") Suggested-by: Maciej Fijalkowski Reviewed-by: Maciej Fijalkowski Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 0a9198037b98..a0a310a75cc5 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2824,20 +2824,25 @@ static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, } } -static int igb_xdp_setup(struct net_device *dev, struct bpf_prog *prog) +static int igb_xdp_setup(struct net_device *dev, struct netdev_bpf *bpf) { int i, frame_size = dev->mtu + IGB_ETH_PKT_HDR_PAD; struct igb_adapter *adapter = netdev_priv(dev); + struct bpf_prog *prog = bpf->prog, *old_prog; bool running = netif_running(dev); - struct bpf_prog *old_prog; bool need_reset; /* verify igb ring attributes are sufficient for XDP */ for (i = 0; i < adapter->num_rx_queues; i++) { struct igb_ring *ring = adapter->rx_ring[i]; - if (frame_size > igb_rx_bufsz(ring)) + if (frame_size > igb_rx_bufsz(ring)) { + NL_SET_ERR_MSG_MOD(bpf->extack, + "The RX buffer size is too small for the frame size"); + netdev_warn(dev, "XDP RX buffer size %d is too small for the frame size %d\n", + igb_rx_bufsz(ring), frame_size); return -EINVAL; + } } old_prog = xchg(&adapter->xdp_prog, prog); @@ -2869,7 +2874,7 @@ static int igb_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: - return igb_xdp_setup(dev, xdp->prog); + return igb_xdp_setup(dev, xdp); default: return -EINVAL; } @@ -6499,7 +6504,9 @@ static int igb_change_mtu(struct net_device *netdev, int new_mtu) struct igb_ring *ring = adapter->rx_ring[i]; if (max_frame > igb_rx_bufsz(ring)) { - netdev_warn(adapter->netdev, "Requested MTU size is not supported with XDP\n"); + netdev_warn(adapter->netdev, + "Requested MTU size is not supported with XDP. Max frame size is %d\n", + max_frame); return -EINVAL; } } From 681429dba99249546dda160e266e56035a2d750b Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:51 +0100 Subject: [PATCH 62/81] igb: skb add metasize for xdp add metasize if it is set in xdp Fixes: 9cbc948b5a20 ("igb: add XDP support") Suggested-by: Maciej Fijalkowski Reviewed-by: Maciej Fijalkowski Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a0a310a75cc5..8e412aaba0e3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8357,6 +8357,7 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, SKB_DATA_ALIGN(xdp->data_end - xdp->data_hard_start); #endif + unsigned int metasize = xdp->data - xdp->data_meta; struct sk_buff *skb; /* prefetch first cache line of first page */ @@ -8371,6 +8372,9 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, skb_reserve(skb, xdp->data - xdp->data_hard_start); __skb_put(skb, xdp->data_end - xdp->data); + if (metasize) + skb_metadata_set(skb, metasize); + /* pull timestamp out of packet data */ if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb); From 3eca859008a75a4ad363db65b0fe83be1a3d5ad1 Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:52 +0100 Subject: [PATCH 63/81] igb: use xdp_do_flush Since it is a new XDP implementation change xdp_do_flush_map to xdp_do_flush. Fixes: 9cbc948b5a20 ("igb: add XDP support") Suggested-by: Maciej Fijalkowski Reviewed-by: Maciej Fijalkowski Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 8e412aaba0e3..af6ace6c0f87 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8781,7 +8781,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) rx_ring->skb = skb; if (xdp_xmit & IGB_XDP_REDIR) - xdp_do_flush_map(); + xdp_do_flush(); if (xdp_xmit & IGB_XDP_TX) { struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter); From ec107e775d84392b35db46f6c3baa441e074042e Mon Sep 17 00:00:00 2001 From: Sven Auhagen Date: Wed, 11 Nov 2020 18:04:53 +0100 Subject: [PATCH 64/81] igb: avoid transmit queue timeout in xdp path Since we share the transmit queue with the network stack, it is possible that we run into a transmit queue timeout. This will reset the queue. This happens under high load when XDP is using the transmit queue pretty much exclusively. netdev_start_xmit() sets the trans_start variable of the transmit queue to jiffies which is later utilized by dev_watchdog(), so to avoid timeout, let stack know that XDP xmit happened by bumping the trans_start within XDP Tx routines to jiffies. Fixes: 9cbc948b5a20 ("igb: add XDP support") Acked-by: Maciej Fijalkowski Signed-off-by: Sven Auhagen Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index af6ace6c0f87..0d343d050973 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2919,6 +2919,8 @@ static int igb_xdp_xmit_back(struct igb_adapter *adapter, struct xdp_buff *xdp) nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); + /* Avoid transmit queue timeout since we share it with the slow path */ + nq->trans_start = jiffies; ret = igb_xmit_xdp_ring(adapter, tx_ring, xdpf); __netif_tx_unlock(nq); @@ -2951,6 +2953,9 @@ static int igb_xdp_xmit(struct net_device *dev, int n, nq = txring_txq(tx_ring); __netif_tx_lock(nq, cpu); + /* Avoid transmit queue timeout since we share it with the slow path */ + nq->trans_start = jiffies; + for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; int err; From 75aab4e10ae6a4593a60f66d13de755d4e91f400 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 25 Aug 2020 19:27:34 +0200 Subject: [PATCH 65/81] i40e: avoid premature Rx buffer reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page recycle code, incorrectly, relied on that a page fragment could not be freed inside xdp_do_redirect(). This assumption leads to that page fragments that are used by the stack/XDP redirect can be reused and overwritten. To avoid this, store the page count prior invoking xdp_do_redirect(). Longer explanation: Intel NICs have a recycle mechanism. The main idea is that a page is split into two parts. One part is owned by the driver, one part might be owned by someone else, such as the stack. t0: Page is allocated, and put on the Rx ring +--------------- used by NIC ->| upper buffer (rx_buffer) +--------------- | lower buffer +--------------- page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX t1: Buffer is received, and passed to the stack (e.g.) +--------------- | upper buff (skb) +--------------- used by NIC ->| lower buffer (rx_buffer) +--------------- page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX - 1 t2: Buffer is received, and redirected +--------------- | upper buff (skb) +--------------- used by NIC ->| lower buffer (rx_buffer) +--------------- Now, prior calling xdp_do_redirect(): page count == USHRT_MAX rx_buffer->pagecnt_bias == USHRT_MAX - 2 This means that buffer *cannot* be flipped/reused, because the skb is still using it. The problem arises when xdp_do_redirect() actually frees the segment. Then we get: page count == USHRT_MAX - 1 rx_buffer->pagecnt_bias == USHRT_MAX - 2 From a recycle perspective, the buffer can be flipped and reused, which means that the skb data area is passed to the Rx HW ring! To work around this, the page count is stored prior calling xdp_do_redirect(). Note that this is not optimal, since the NIC could actually reuse the "lower buffer" again. However, then we need to track whether XDP_REDIRECT consumed the buffer or not. Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT") Reported-and-analyzed-by: Li RongQing Signed-off-by: Björn Töpel Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 27 +++++++++++++++------ 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index d43ce13a93c9..3f5825fa67c9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1850,6 +1850,7 @@ static inline bool i40e_page_is_reusable(struct page *page) * the adapter for another receive * * @rx_buffer: buffer containing the page + * @rx_buffer_pgcnt: buffer page refcount pre xdp_do_redirect() call * * If page is reusable, rx_buffer->page_offset is adjusted to point to * an unused region in the page. @@ -1872,7 +1873,8 @@ static inline bool i40e_page_is_reusable(struct page *page) * * In either case, if the page is reusable its refcount is increased. **/ -static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) +static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer, + int rx_buffer_pgcnt) { unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; struct page *page = rx_buffer->page; @@ -1883,7 +1885,7 @@ static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely((page_count(page) - pagecnt_bias) > 1)) + if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) return false; #else #define I40E_LAST_OFFSET \ @@ -1942,16 +1944,24 @@ static void i40e_add_rx_frag(struct i40e_ring *rx_ring, * i40e_get_rx_buffer - Fetch Rx buffer and synchronize data for use * @rx_ring: rx descriptor ring to transact packets on * @size: size of buffer to add to skb + * @rx_buffer_pgcnt: buffer page refcount * * This function will pull an Rx buffer from the ring and synchronize it * for use by the CPU. */ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, - const unsigned int size) + const unsigned int size, + int *rx_buffer_pgcnt) { struct i40e_rx_buffer *rx_buffer; rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + *rx_buffer_pgcnt = +#if (PAGE_SIZE < 8192) + page_count(rx_buffer->page); +#else + 0; +#endif prefetch_page_address(rx_buffer->page); /* we are reusing so sync this buffer for CPU use */ @@ -2102,14 +2112,16 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, * i40e_put_rx_buffer - Clean up used buffer and either recycle or free * @rx_ring: rx descriptor ring to transact packets on * @rx_buffer: rx buffer to pull data from + * @rx_buffer_pgcnt: rx buffer page refcount pre xdp_do_redirect() call * * This function will clean up the contents of the rx_buffer. It will * either recycle the buffer or unmap it and free the associated resources. */ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *rx_buffer) + struct i40e_rx_buffer *rx_buffer, + int rx_buffer_pgcnt) { - if (i40e_can_reuse_rx_page(rx_buffer)) { + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { /* hand second half of page back to the ring */ i40e_reuse_rx_page(rx_ring, rx_buffer); } else { @@ -2336,6 +2348,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) while (likely(total_rx_packets < (unsigned int)budget)) { struct i40e_rx_buffer *rx_buffer; union i40e_rx_desc *rx_desc; + int rx_buffer_pgcnt; unsigned int size; u64 qword; @@ -2378,7 +2391,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) break; i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = i40e_get_rx_buffer(rx_ring, size); + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); /* retrieve a buffer from the ring */ if (!skb) { @@ -2421,7 +2434,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) break; } - i40e_put_rx_buffer(rx_ring, rx_buffer); + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); cleaned_count++; if (i40e_is_non_eop(rx_ring, rx_desc, skb)) From a06316dc87bdc000f7f39a315476957af2ba0f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 25 Aug 2020 19:27:35 +0200 Subject: [PATCH 66/81] ixgbe: avoid premature Rx buffer reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page recycle code, incorrectly, relied on that a page fragment could not be freed inside xdp_do_redirect(). This assumption leads to that page fragments that are used by the stack/XDP redirect can be reused and overwritten. To avoid this, store the page count prior invoking xdp_do_redirect(). Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect") Reported-and-analyzed-by: Li RongQing Signed-off-by: Björn Töpel Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 45ae33e15303..f3f449f53920 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1945,7 +1945,8 @@ static inline bool ixgbe_page_is_reserved(struct page *page) return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } -static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer) +static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer, + int rx_buffer_pgcnt) { unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; struct page *page = rx_buffer->page; @@ -1956,7 +1957,7 @@ static bool ixgbe_can_reuse_rx_page(struct ixgbe_rx_buffer *rx_buffer) #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely((page_ref_count(page) - pagecnt_bias) > 1)) + if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) return false; #else /* The last offset is a bit aggressive in that we assume the @@ -2021,11 +2022,18 @@ static void ixgbe_add_rx_frag(struct ixgbe_ring *rx_ring, static struct ixgbe_rx_buffer *ixgbe_get_rx_buffer(struct ixgbe_ring *rx_ring, union ixgbe_adv_rx_desc *rx_desc, struct sk_buff **skb, - const unsigned int size) + const unsigned int size, + int *rx_buffer_pgcnt) { struct ixgbe_rx_buffer *rx_buffer; rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; + *rx_buffer_pgcnt = +#if (PAGE_SIZE < 8192) + page_count(rx_buffer->page); +#else + 0; +#endif prefetchw(rx_buffer->page); *skb = rx_buffer->skb; @@ -2055,9 +2063,10 @@ skip_sync: static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring, struct ixgbe_rx_buffer *rx_buffer, - struct sk_buff *skb) + struct sk_buff *skb, + int rx_buffer_pgcnt) { - if (ixgbe_can_reuse_rx_page(rx_buffer)) { + if (ixgbe_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { /* hand second half of page back to the ring */ ixgbe_reuse_rx_page(rx_ring, rx_buffer); } else { @@ -2303,6 +2312,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, union ixgbe_adv_rx_desc *rx_desc; struct ixgbe_rx_buffer *rx_buffer; struct sk_buff *skb; + int rx_buffer_pgcnt; unsigned int size; /* return some buffers to hardware, one at a time is too slow */ @@ -2322,7 +2332,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, */ dma_rmb(); - rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size); + rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size, &rx_buffer_pgcnt); /* retrieve a buffer from the ring */ if (!skb) { @@ -2367,7 +2377,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, break; } - ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb); + ixgbe_put_rx_buffer(rx_ring, rx_buffer, skb, rx_buffer_pgcnt); cleaned_count++; /* place incomplete frames back on ring for completion */ From 1beb7830d3b285b28f7cde3644d59d2590a47e51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 25 Aug 2020 19:27:36 +0200 Subject: [PATCH 67/81] ice: avoid premature Rx buffer reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page recycle code, incorrectly, relied on that a page fragment could not be freed inside xdp_do_redirect(). This assumption leads to that page fragments that are used by the stack/XDP redirect can be reused and overwritten. To avoid this, store the page count prior invoking xdp_do_redirect(). Fixes: efc2214b6047 ("ice: Add support for XDP") Reported-and-analyzed-by: Li RongQing Signed-off-by: Björn Töpel Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 31 ++++++++++++++++------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index eae75260fe20..23eca2f0a03b 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -762,13 +762,15 @@ ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) /** * ice_can_reuse_rx_page - Determine if page can be reused for another Rx * @rx_buf: buffer containing the page + * @rx_buf_pgcnt: rx_buf page refcount pre xdp_do_redirect() call * * If page is reusable, we have a green light for calling ice_reuse_rx_page, * which will assign the current buffer to the buffer that next_to_alloc is * pointing to; otherwise, the DMA mapping needs to be destroyed and * page freed */ -static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) +static bool +ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt) { unsigned int pagecnt_bias = rx_buf->pagecnt_bias; struct page *page = rx_buf->page; @@ -779,7 +781,7 @@ static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely((page_count(page) - pagecnt_bias) > 1)) + if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1)) return false; #else #define ICE_LAST_OFFSET \ @@ -864,17 +866,24 @@ ice_reuse_rx_page(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf) * @rx_ring: Rx descriptor ring to transact packets on * @skb: skb to be used * @size: size of buffer to add to skb + * @rx_buf_pgcnt: rx_buf page refcount * * This function will pull an Rx buffer from the ring and synchronize it * for use by the CPU. */ static struct ice_rx_buf * ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb, - const unsigned int size) + const unsigned int size, int *rx_buf_pgcnt) { struct ice_rx_buf *rx_buf; rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean]; + *rx_buf_pgcnt = +#if (PAGE_SIZE < 8192) + page_count(rx_buf->page); +#else + 0; +#endif prefetchw(rx_buf->page); *skb = rx_buf->skb; @@ -1006,12 +1015,15 @@ ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, * ice_put_rx_buf - Clean up used buffer and either recycle or free * @rx_ring: Rx descriptor ring to transact packets on * @rx_buf: Rx buffer to pull data from + * @rx_buf_pgcnt: Rx buffer page count pre xdp_do_redirect() * * This function will update next_to_clean and then clean up the contents * of the rx_buf. It will either recycle the buffer or unmap it and free * the associated resources. */ -static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) +static void +ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf, + int rx_buf_pgcnt) { u16 ntc = rx_ring->next_to_clean + 1; @@ -1022,7 +1034,7 @@ static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf) if (!rx_buf) return; - if (ice_can_reuse_rx_page(rx_buf)) { + if (ice_can_reuse_rx_page(rx_buf, rx_buf_pgcnt)) { /* hand second half of page back to the ring */ ice_reuse_rx_page(rx_ring, rx_buf); } else { @@ -1097,6 +1109,7 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) struct sk_buff *skb; unsigned int size; u16 stat_err_bits; + int rx_buf_pgcnt; u16 vlan_tag = 0; u8 rx_ptype; @@ -1119,7 +1132,7 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) dma_rmb(); if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) { - ice_put_rx_buf(rx_ring, NULL); + ice_put_rx_buf(rx_ring, NULL, 0); cleaned_count++; continue; } @@ -1128,7 +1141,7 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) ICE_RX_FLX_DESC_PKT_LEN_M; /* retrieve a buffer from the ring */ - rx_buf = ice_get_rx_buf(rx_ring, &skb, size); + rx_buf = ice_get_rx_buf(rx_ring, &skb, size, &rx_buf_pgcnt); if (!size) { xdp.data = NULL; @@ -1168,7 +1181,7 @@ int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget) total_rx_pkts++; cleaned_count++; - ice_put_rx_buf(rx_ring, rx_buf); + ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt); continue; construct_skb: if (skb) { @@ -1187,7 +1200,7 @@ construct_skb: break; } - ice_put_rx_buf(rx_ring, rx_buf); + ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt); cleaned_count++; /* skip if it is NOP desc */ From a379b01cd4b2aa3f12786b281a714871574e5ccb Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Tue, 8 Dec 2020 12:56:32 -0600 Subject: [PATCH 68/81] e1000e: fix S0ix flow to allow S0i3.2 subset entry Changed a configuration in the flows to align with architecture requirements to achieve S0i3.2 substate. This helps both i219V and i219LM configurations. Also fixed a typo in the previous commit 632fbd5eb5b0 ("e1000e: fix S0ix flows for cable connected case"). Fixes: 632fbd5eb5b0 ("e1000e: fix S0ix flows for cable connected case"). Signed-off-by: Vitaly Lifshits Tested-by: Aaron Brown Signed-off-by: Tony Nguyen Reviewed-by: Alexander Duyck Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20201208185632.151052-1-mario.limonciello@dell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/e1000e/netdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index b30f00891c03..128ab6898070 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6475,13 +6475,13 @@ static void e1000e_s0ix_entry_flow(struct e1000_adapter *adapter) /* Ungate PGCB clock */ mac_data = er32(FEXTNVM9); - mac_data |= BIT(28); + mac_data &= ~BIT(28); ew32(FEXTNVM9, mac_data); /* Enable K1 off to enable mPHY Power Gating */ mac_data = er32(FEXTNVM6); mac_data |= BIT(31); - ew32(FEXTNVM12, mac_data); + ew32(FEXTNVM6, mac_data); /* Enable mPHY power gating for any link and speed */ mac_data = er32(FEXTNVM8); @@ -6525,11 +6525,11 @@ static void e1000e_s0ix_exit_flow(struct e1000_adapter *adapter) /* Disable K1 off */ mac_data = er32(FEXTNVM6); mac_data &= ~BIT(31); - ew32(FEXTNVM12, mac_data); + ew32(FEXTNVM6, mac_data); /* Disable Ungate PGCB clock */ mac_data = er32(FEXTNVM9); - mac_data &= ~BIT(28); + mac_data |= BIT(28); ew32(FEXTNVM9, mac_data); /* Cancel not waking from dynamic From a770bf515613c6e12ae904c3593e26016de99448 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Tue, 8 Dec 2020 23:13:51 +0100 Subject: [PATCH 69/81] ethtool: fix stack overflow in ethnl_parse_bitset() Syzbot reported a stack overflow in bitmap_from_arr32() called from ethnl_parse_bitset() when bitset from netlink message is longer than target bitmap length. While ethnl_compact_sanity_checks() makes sure that trailing part is all zeros (i.e. the request does not try to touch bits kernel does not recognize), we also need to cap change_bits to nbits so that we don't try to write past the prepared bitmaps. Fixes: 88db6d1e4f62 ("ethtool: add ethnl_parse_bitset() helper") Reported-by: syzbot+9d39fa49d4df294aab93@syzkaller.appspotmail.com Signed-off-by: Michal Kubecek Link: https://lore.kernel.org/r/3487ee3a98e14cd526f55b6caaa959d2dcbcad9f.1607465316.git.mkubecek@suse.cz Signed-off-by: Jakub Kicinski --- net/ethtool/bitset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 1fb3603d92ad..0515d6604b3b 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -628,6 +628,8 @@ int ethnl_parse_bitset(unsigned long *val, unsigned long *mask, return ret; change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]); + if (change_bits > nbits) + change_bits = nbits; bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]), change_bits); if (change_bits < nbits) From 8ef44b6fe49d2b8d03ba9aa69063612b474f963b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 8 Dec 2020 09:55:08 -0800 Subject: [PATCH 70/81] tcp: Retain ECT bits for tos reflection For DCTCP, we have to retain the ECT bits set by the congestion control algorithm on the socket when reflecting syn TOS in syn-ack, in order to make ECN work properly. Fixes: ac8f1710c12b ("tcp: reflect tos value received in SYN to the socket") Reported-by: Alexander Duyck Signed-off-by: Wei Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8391aa29e7a4..595dcc3afac5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -984,7 +984,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : + (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (inet_sk(sk)->tos & INET_ECN_MASK) : inet_sk(sk)->tos; if (!INET_ECN_is_capable(tos) && @@ -1541,7 +1542,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). + */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 992cbf3eb9e3..991dc36f95ff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -528,7 +528,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : + (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && @@ -1320,7 +1321,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (np->repflow) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). + */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; From 5137d303659d8c324e67814b1cc2e1bc0c0d9836 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Tue, 8 Dec 2020 10:48:35 +0800 Subject: [PATCH 71/81] net: flow_offload: Fix memory leak for indirect flow block The offending commit introduces a cleanup callback that is invoked when the driver module is removed to clean up the tunnel device flow block. But it returns on the first iteration of the for loop. The remaining indirect flow blocks will never be freed. Fixes: 1fac52da5942 ("net: flow_offload: consolidate indirect flow_block infrastructure") CC: Pablo Neira Ayuso Signed-off-by: Chris Mi Reviewed-by: Roi Dayan --- net/core/flow_offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index d4474c812b64..715b67f6c62f 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -381,10 +381,8 @@ static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { if (this->release == release && - this->indr.cb_priv == cb_priv) { + this->indr.cb_priv == cb_priv) list_move(&this->indr.list, cleanup_list); - return; - } } } From 299bcb55ecd1412f6df606e9dc0912d55610029e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 8 Dec 2020 22:57:59 -0500 Subject: [PATCH 72/81] tcp: fix cwnd-limited bug for TSO deferral where we send nothing When cwnd is not a multiple of the TSO skb size of N*MSS, we can get into persistent scenarios where we have the following sequence: (1) ACK for full-sized skb of N*MSS arrives -> tcp_write_xmit() transmit full-sized skb with N*MSS -> move pacing release time forward -> exit tcp_write_xmit() because pacing time is in the future (2) TSQ callback or TCP internal pacing timer fires -> try to transmit next skb, but TSO deferral finds remainder of available cwnd is not big enough to trigger an immediate send now, so we defer sending until the next ACK. (3) repeat... So we can get into a case where we never mark ourselves as cwnd-limited for many seconds at a time, even with bulk/infinite-backlog senders, because: o In case (1) above, every time in tcp_write_xmit() we have enough cwnd to send a full-sized skb, we are not fully using the cwnd (because cwnd is not a multiple of the TSO skb size). So every time we send data, we are not cwnd limited, and so in the cwnd-limited tracking code in tcp_cwnd_validate() we mark ourselves as not cwnd-limited. o In case (2) above, every time in tcp_write_xmit() that we try to transmit the "remainder" of the cwnd but defer, we set the local variable is_cwnd_limited to true, but we do not send any packets, so sent_pkts is zero, so we don't call the cwnd-limited logic to update tp->is_cwnd_limited. Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler") Reported-by: Ingemar Johansson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201209035759.1225145-1-ncardwell.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e967..99011768c264 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1880,7 +1880,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. */ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2702,6 +2703,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2709,8 +2714,6 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); From fed91613c9dd455dd154b22fa8e11b8526466082 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 9 Dec 2020 15:03:38 +0200 Subject: [PATCH 73/81] net/mlx4_en: Avoid scheduling restart task if it is already running Add restarting state flag to avoid scheduling another restart task while such task is already running. Change task name from watchdog_task to restart_task to better fit the task role. Fixes: 1e338db56e5a ("mlx4_en: Fix a race at restart task") Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 20 ++++++++++++------- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 7 ++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 106513f772c3..1a2b0bd64aa9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1378,8 +1378,10 @@ static void mlx4_en_tx_timeout(struct net_device *dev, unsigned int txqueue) tx_ring->cons, tx_ring->prod); priv->port_stats.tx_timeout++; - en_dbg(DRV, priv, "Scheduling watchdog\n"); - queue_work(mdev->workqueue, &priv->watchdog_task); + if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state)) { + en_dbg(DRV, priv, "Scheduling port restart\n"); + queue_work(mdev->workqueue, &priv->restart_task); + } } @@ -1829,6 +1831,7 @@ int mlx4_en_start_port(struct net_device *dev) local_bh_enable(); } + clear_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state); netif_tx_start_all_queues(dev); netif_device_attach(dev); @@ -1999,7 +2002,7 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) static void mlx4_en_restart(struct work_struct *work) { struct mlx4_en_priv *priv = container_of(work, struct mlx4_en_priv, - watchdog_task); + restart_task); struct mlx4_en_dev *mdev = priv->mdev; struct net_device *dev = priv->dev; @@ -2377,7 +2380,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) if (netif_running(dev)) { mutex_lock(&mdev->state_lock); if (!mdev->device_up) { - /* NIC is probably restarting - let watchdog task reset + /* NIC is probably restarting - let restart task reset * the port */ en_dbg(DRV, priv, "Change MTU called with card down!?\n"); } else { @@ -2386,7 +2389,9 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) if (err) { en_err(priv, "Failed restarting port:%d\n", priv->port); - queue_work(mdev->workqueue, &priv->watchdog_task); + if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, + &priv->state)) + queue_work(mdev->workqueue, &priv->restart_task); } } mutex_unlock(&mdev->state_lock); @@ -2792,7 +2797,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) if (err) { en_err(priv, "Failed starting port %d for XDP change\n", priv->port); - queue_work(mdev->workqueue, &priv->watchdog_task); + if (!test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state)) + queue_work(mdev->workqueue, &priv->restart_task); } } @@ -3165,7 +3171,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, priv->counter_index = MLX4_SINK_COUNTER_INDEX(mdev->dev); spin_lock_init(&priv->stats_lock); INIT_WORK(&priv->rx_mode_task, mlx4_en_do_set_rx_mode); - INIT_WORK(&priv->watchdog_task, mlx4_en_restart); + INIT_WORK(&priv->restart_task, mlx4_en_restart); INIT_WORK(&priv->linkstate_task, mlx4_en_linkstate); INIT_DELAYED_WORK(&priv->stats_task, mlx4_en_do_get_stats); INIT_DELAYED_WORK(&priv->service_task, mlx4_en_service_task); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index a46efe37cfa9..fd9535bde1b8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -530,6 +530,10 @@ struct mlx4_en_stats_bitmap { struct mutex mutex; /* for mutual access to stats bitmap */ }; +enum { + MLX4_EN_STATE_FLAG_RESTARTING, +}; + struct mlx4_en_priv { struct mlx4_en_dev *mdev; struct mlx4_en_port_profile *prof; @@ -595,7 +599,7 @@ struct mlx4_en_priv { struct mlx4_en_cq *rx_cq[MAX_RX_RINGS]; struct mlx4_qp drop_qp; struct work_struct rx_mode_task; - struct work_struct watchdog_task; + struct work_struct restart_task; struct work_struct linkstate_task; struct delayed_work stats_task; struct delayed_work service_task; @@ -641,6 +645,7 @@ struct mlx4_en_priv { u32 pflags; u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; u8 rss_hash_fn; + unsigned long state; }; enum mlx4_en_wol { From ba603d9d7b1215c72513d7c7aa02b6775fd4891b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 9 Dec 2020 15:03:39 +0200 Subject: [PATCH 74/81] net/mlx4_en: Handle TX error CQE In case error CQE was found while polling TX CQ, the QP is in error state and all posted WQEs will generate error CQEs without any data transmitted. Fix it by reopening the channels, via same method used for TX timeout handling. In addition add some more info on error CQE and WQE for debug. Fixes: bd2f631d7c60 ("net/mlx4_en: Notify user when TX ring in error state") Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_tx.c | 40 +++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 5 +++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 1a2b0bd64aa9..6f290319b617 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1735,6 +1735,7 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_en_deactivate_cq(priv, cq); goto tx_err; } + clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state); if (t != TX_XDP) { tx_ring->tx_queue = netdev_get_tx_queue(dev, i); tx_ring->recycle_ring = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 3ddb7268e415..59b097cda327 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -392,6 +392,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) return cnt; } +static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe, + u16 cqe_index, struct mlx4_en_tx_ring *ring) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_en_tx_info *tx_info; + struct mlx4_en_tx_desc *tx_desc; + u16 wqe_index; + int desc_size; + + en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n", + ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe), + false); + + wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask; + tx_info = &ring->tx_info[wqe_index]; + desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE; + en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn, + wqe_index, desc_size); + tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE); + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false); + + if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state)) + return; + + en_err(priv, "Scheduling port restart\n"); + queue_work(mdev->workqueue, &priv->restart_task); +} + int mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int napi_budget) { @@ -438,13 +467,10 @@ int mlx4_en_process_tx_cq(struct net_device *dev, dma_rmb(); if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == - MLX4_CQE_OPCODE_ERROR)) { - struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe; - - en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n", - cqe_err->vendor_err_syndrome, - cqe_err->syndrome); - } + MLX4_CQE_OPCODE_ERROR)) + if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state)) + mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index, + ring); /* Skip over last polled CQE */ new_index = be16_to_cpu(cqe->wqe_index) & size_mask; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index fd9535bde1b8..30378e4c90b5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -271,6 +271,10 @@ struct mlx4_en_page_cache { } buf[MLX4_EN_CACHE_SIZE]; }; +enum { + MLX4_EN_TX_RING_STATE_RECOVERING, +}; + struct mlx4_en_priv; struct mlx4_en_tx_ring { @@ -317,6 +321,7 @@ struct mlx4_en_tx_ring { * Only queue_stopped might be used if BQL is not properly working. */ unsigned long queue_stopped; + unsigned long state; struct mlx4_hwq_resources sp_wqres; struct mlx4_qp sp_qp; struct mlx4_qp_context sp_context; From 7fdd375e383097a785bb65c66802e468f398bf82 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 9 Dec 2020 16:48:41 +0100 Subject: [PATCH 75/81] net: sched: Fix dump of MPLS_OPT_LSE_LABEL attribute in cls_flower TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL is a u32 attribute (MPLS label is 20 bits long). Fixes the following bug: $ tc filter add dev ethX ingress protocol mpls_uc \ flower mpls lse depth 2 label 256 \ action drop $ tc filter show dev ethX ingress filter protocol mpls_uc pref 49152 flower chain 0 filter protocol mpls_uc pref 49152 flower chain 0 handle 0x1 eth_type 8847 mpls lse depth 2 label 0 <-- invalid label 0, should be 256 ... Fixes: 61aec25a6db5 ("cls_flower: Support filtering on multiple MPLS Label Stack Entries") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index fed18fd2c50b..1319986693fc 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2424,8 +2424,8 @@ static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, return err; } if (lse_mask->mpls_label) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, - lse_key->mpls_label); + err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); if (err) return err; } From 177745beebe39773004921d6bffd6c94c77dca32 Mon Sep 17 00:00:00 2001 From: Mickey Rachamim Date: Wed, 9 Dec 2020 15:47:39 +0200 Subject: [PATCH 76/81] MAINTAINERS: Add entry for Marvell Prestera Ethernet Switch driver Add maintainers info for new Marvell Prestera Ethernet switch driver. Signed-off-by: Mickey Rachamim Signed-off-by: David S. Miller --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ebe4829cdd4d..0381eb273944 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10555,6 +10555,13 @@ S: Supported F: Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst F: drivers/net/ethernet/marvell/octeontx2/af/ +MARVELL PRESTERA ETHERNET SWITCH DRIVER +M: Vadym Kochan +M: Taras Chornyi +S: Supported +W: https://github.com/Marvell-switching/switchdev-prestera +F: drivers/net/ethernet/marvell/prestera/ + MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER M: Nicolas Pitre S: Odd Fixes From b02709587ea3d699a608568ee8157d8db4fd8cae Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 8 Dec 2020 19:01:51 +0100 Subject: [PATCH 77/81] bpf: Fix propagation of 32-bit signed bounds from 64-bit bounds. The 64-bit signed bounds should not affect 32-bit signed bounds unless the verifier knows that upper 32-bits are either all 1s or all 0s. For example the register with smin_value==1 doesn't mean that s32_min_value is also equal to 1, since smax_value could be larger than 32-bit subregister can hold. The verifier refines the smax/s32_max return value from certain helpers in do_refine_retval_range(). Teach the verifier to recognize that smin/s32_min value is also bounded. When both smin and smax bounds fit into 32-bit subregister the verifier can propagate those bounds. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1388bf733071..53fe6ef6d931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1298,9 +1298,7 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg) static bool __reg64_bound_s32(s64 a) { - if (a > S32_MIN && a < S32_MAX) - return true; - return false; + return a > S32_MIN && a < S32_MAX; } static bool __reg64_bound_u32(u64 a) @@ -1314,10 +1312,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value)) + if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { reg->s32_min_value = (s32)reg->smin_value; - if (__reg64_bound_s32(reg->smax_value)) reg->s32_max_value = (s32)reg->smax_value; + } if (__reg64_bound_u32(reg->umin_value)) reg->u32_min_value = (u32)reg->umin_value; if (__reg64_bound_u32(reg->umax_value)) @@ -4895,6 +4893,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, ret_reg->smax_value = meta->msize_max_value; ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); __update_reg_bounds(ret_reg); From 511a76bcb0ce242a19153658b25437906cc6070e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 8 Dec 2020 19:01:52 +0100 Subject: [PATCH 78/81] selftests/bpf: Add test for signed 32-bit bound check bug After a 32-bit load followed by a branch, the verifier would reduce the maximum bound of the register to 0x7fffffff, allowing a user to bypass bound checks. Ensure such a program is rejected. In the second test, the 64-bit compare should not sufficient to determine whether the signed 32-bit lower bound is 0, so the verifier should reject the second branch. Signed-off-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/bounds.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index dac40de3f868..57ed67b86074 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -703,3 +703,44 @@ .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, +{ + "bounds checks after 32-bit truncation. test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + /* This used to reduce the max bound to 0x7fffffff */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0x7fffffff, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "bounds checks after 32-bit truncation. test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_1, 1, 1), + BPF_JMP32_IMM(BPF_JSLT, BPF_REG_1, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, From 77ce220c0549dcc3db8226c61c60e83fc59dfafc Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 8 Dec 2020 19:01:53 +0100 Subject: [PATCH 79/81] selftests/bpf: Fix array access with signed variable test The test fails because of a recent fix to the verifier, even though this program is valid. In details what happens is: 7: (61) r1 = *(u32 *)(r0 +0) Load a 32-bit value, with signed bounds [S32_MIN, S32_MAX]. The bounds of the 64-bit value are [0, U32_MAX]... 8: (65) if r1 s> 0xffffffff goto pc+1 ... therefore this is always true (the operand is sign-extended). 10: (b4) w2 = 11 11: (6d) if r2 s> r1 goto pc+1 When true, the 64-bit bounds become [0, 10]. The 32-bit bounds are still [S32_MIN, 10]. 13: (64) w1 <<= 2 Because this is a 32-bit operation, the verifier propagates the new 32-bit bounds to the 64-bit ones, and the knowledge gained from insn 11 is lost. 14: (0f) r0 += r1 15: (7a) *(u64 *)(r0 +0) = 4 Then the verifier considers r0 unbounded here, rejecting the test. To make the test work, change insn 8 to check the sign of the 32-bit value. Signed-off-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/array_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index 1c4b1939f5a8..bed53b561e04 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -68,7 +68,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), + BPF_JMP32_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), BPF_MOV32_IMM(BPF_REG_1, 0), BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), From 3615bdf6d9b19db12b1589861609b4f1c6a8d303 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 8 Dec 2020 19:01:54 +0100 Subject: [PATCH 80/81] selftests/bpf: Fix "dubious pointer arithmetic" test The verifier trace changed following a bugfix. After checking the 64-bit sign, only the upper bit mask is known, not bit 31. Update the test accordingly. Signed-off-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/align.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 52414058a627..5861446d0777 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -456,10 +456,10 @@ static struct bpf_align_test tests[] = { */ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -467,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { From 38bf8cd821be292e7d8e6f6283d67c5d9708f887 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Dec 2020 12:21:13 +0100 Subject: [PATCH 81/81] selftests: fix poll error in udpgro.sh The test program udpgso_bench_rx always invokes the poll() syscall with a timeout of 10ms. If a larger timeout is specified via the command line, udpgso_bench_rx is supposed to do multiple poll() calls till the timeout is expired or an event is received. Currently the poll() loop errors out after the first invocation with no events, and may causes self-tests failure alike: failed GRO with custom segment size ./udpgso_bench_rx: poll: 0x0 expected 0x1 This change addresses the issue allowing the poll() loop to consume all the configured timeout. Fixes: ada641ff6ed3 ("selftests: fixes for UDP GRO") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgso_bench_rx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index db3d4a8b5a4c..76a24052f4b4 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -113,6 +113,9 @@ static void do_poll(int fd, int timeout_ms) interrupted = true; break; } + + /* no events and more time to wait, do poll again */ + continue; } if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n",