Merge branch 'tcp-backlog-processing-optims'
Eric Dumazet says: ==================== tcp: backlog processing optims The first patches mostly prepare the ground for the last one. The last patch of the series implements a form of ACK reduction, only for the cases where a TCP receiver is under high stress, which happens for high-throughput flows. This gives us a ~20% increase in single TCP flow throughput (100Gbit -> 120Gbit) ==================== Link: https://lore.kernel.org/r/20230911170531.828100-1-edumazet@google.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
|
||||
|
||||
Default : 44
|
||||
|
||||
tcp_backlog_ack_defer - BOOLEAN
|
||||
If set, a user thread processing the socket backlog tries to send
|
||||
one ACK for the whole queue. This helps to avoid potentially
|
||||
long latencies at the end of a TCP socket syscall.
|
||||
|
||||
Default : true
|
||||
|
||||
tcp_slow_start_after_idle - BOOLEAN
|
||||
If set, provide RFC2861 behavior and time out the congestion
|
||||
window after an idle period. An idle period is defined at
|
||||
|
||||
+8
-6
@@ -463,15 +463,17 @@ enum tsq_enum {
|
||||
TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
|
||||
* tcp_v{4|6}_mtu_reduced()
|
||||
*/
|
||||
TCP_ACK_DEFERRED, /* TX pure ack is deferred */
|
||||
};
|
||||
|
||||
enum tsq_flags {
|
||||
TSQF_THROTTLED = (1UL << TSQ_THROTTLED),
|
||||
TSQF_QUEUED = (1UL << TSQ_QUEUED),
|
||||
TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED),
|
||||
TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED),
|
||||
TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED),
|
||||
TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED),
|
||||
TSQF_THROTTLED = BIT(TSQ_THROTTLED),
|
||||
TSQF_QUEUED = BIT(TSQ_QUEUED),
|
||||
TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED),
|
||||
TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED),
|
||||
TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED),
|
||||
TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED),
|
||||
TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED),
|
||||
};
|
||||
|
||||
#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
|
||||
|
||||
@@ -132,6 +132,7 @@ struct netns_ipv4 {
|
||||
u8 sysctl_tcp_syncookies;
|
||||
u8 sysctl_tcp_migrate_req;
|
||||
u8 sysctl_tcp_comp_sack_nr;
|
||||
u8 sysctl_tcp_backlog_ack_defer;
|
||||
int sysctl_tcp_reordering;
|
||||
u8 sysctl_tcp_retries1;
|
||||
u8 sysctl_tcp_retries2;
|
||||
|
||||
+4
-5
@@ -1823,12 +1823,11 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
|
||||
|
||||
static inline void sock_release_ownership(struct sock *sk)
|
||||
{
|
||||
if (sock_owned_by_user_nocheck(sk)) {
|
||||
sk->sk_lock.owned = 0;
|
||||
DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
|
||||
sk->sk_lock.owned = 0;
|
||||
|
||||
/* The sk_lock has mutex_unlock() semantics: */
|
||||
mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
|
||||
}
|
||||
/* The sk_lock has mutex_unlock() semantics: */
|
||||
mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
|
||||
}
|
||||
|
||||
/* no reclassification while locks are held */
|
||||
|
||||
+3
-3
@@ -3001,6 +3001,9 @@ void __sk_flush_backlog(struct sock *sk)
|
||||
{
|
||||
spin_lock_bh(&sk->sk_lock.slock);
|
||||
__release_sock(sk);
|
||||
|
||||
if (sk->sk_prot->release_cb)
|
||||
sk->sk_prot->release_cb(sk);
|
||||
spin_unlock_bh(&sk->sk_lock.slock);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
|
||||
@@ -3519,9 +3522,6 @@ void release_sock(struct sock *sk)
|
||||
if (sk->sk_backlog.tail)
|
||||
__release_sock(sk);
|
||||
|
||||
/* Warning : release_cb() might need to release sk ownership,
|
||||
* ie call sock_release_ownership(sk) before us.
|
||||
*/
|
||||
if (sk->sk_prot->release_cb)
|
||||
sk->sk_prot->release_cb(sk);
|
||||
|
||||
|
||||
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
|
||||
.proc_handler = proc_dou8vec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_backlog_ack_defer",
|
||||
.data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
|
||||
.maxlen = sizeof(u8),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dou8vec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_reflect_tos",
|
||||
.data = &init_net.ipv4.sysctl_tcp_reflect_tos,
|
||||
|
||||
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
|
||||
tcp_in_quickack_mode(sk) ||
|
||||
/* Protocol state mandates a one-time immediate ACK */
|
||||
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
|
||||
/* If we are running from __release_sock() in user context,
|
||||
* Defer the ack until tcp_release_cb().
|
||||
*/
|
||||
if (sock_owned_by_user_nocheck(sk) &&
|
||||
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
|
||||
set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
|
||||
return;
|
||||
}
|
||||
send_now:
|
||||
tcp_send_ack(sk);
|
||||
return;
|
||||
|
||||
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
|
||||
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
|
||||
net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
|
||||
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
|
||||
net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
|
||||
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
|
||||
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
|
||||
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
|
||||
|
||||
+4
-11
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
|
||||
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
|
||||
TCPF_WRITE_TIMER_DEFERRED | \
|
||||
TCPF_DELACK_TIMER_DEFERRED | \
|
||||
TCPF_MTU_REDUCED_DEFERRED)
|
||||
TCPF_MTU_REDUCED_DEFERRED | \
|
||||
TCPF_ACK_DEFERRED)
|
||||
/**
|
||||
* tcp_release_cb - tcp release_sock() callback
|
||||
* @sk: socket
|
||||
@@ -1101,16 +1102,6 @@ void tcp_release_cb(struct sock *sk)
|
||||
tcp_tsq_write(sk);
|
||||
__sock_put(sk);
|
||||
}
|
||||
/* Here begins the tricky part :
|
||||
* We are called from release_sock() with :
|
||||
* 1) BH disabled
|
||||
* 2) sk_lock.slock spinlock held
|
||||
* 3) socket owned by us (sk->sk_lock.owned == 1)
|
||||
*
|
||||
* But following code is meant to be called from BH handlers,
|
||||
* so we should keep BH disabled, but early release socket ownership
|
||||
*/
|
||||
sock_release_ownership(sk);
|
||||
|
||||
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
|
||||
tcp_write_timer_handler(sk);
|
||||
@@ -1124,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
|
||||
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
|
||||
__sock_put(sk);
|
||||
}
|
||||
if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
|
||||
tcp_send_ack(sk);
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_release_cb);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user