author      David S. Miller <davem@davemloft.net>  2018-09-21 19:38:00 -0700
committer   David S. Miller <davem@davemloft.net>  2018-09-21 19:38:00 -0700
commit      a88e24f270ebed5499f10615e64c11ccd2210517 (patch)
tree        bec31e6a1fd0762ac9a4cf0e5106e057481f2384 /net/ipv4/tcp_output.c
parent      4f4b93a88c9cf40b3d8711cba062d2dd45f30896 (diff)
parent      90caf67b01fabdd51b6cdeeb23b29bf73901df90 (diff)
Merge branch 'tcp-switch-to-Early-Departure-Time-model'
Eric Dumazet says:
====================
tcp: switch to Early Departure Time model
In the early days, pacing was implemented in sch_fq (FQ)
in a generic way (see the sketch after the list below):
- SO_MAX_PACING_RATE could be used by any socket.
- TCP would vary the effective pacing rate based on CWND*MSS/SRTT.
- FQ would ensure delays between packets based on the current
sk->sk_pacing_rate, but with some quantum-based artifacts
(inflating RPC tail latencies).
- BBR then tweaked the pacing rate in its various phases
(PROBE, DRAIN, ...)
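For reference, that pre-EDT arithmetic can be sketched as below. This is a
simplified user-space illustration of the two relationships above (a rate derived
from CWND*MSS/SRTT, and a len/rate gap between packets), not the kernel code,
which also applies slow-start/congestion-avoidance scaling ratios and FQ's quantum
batching; helper names and sample numbers are only illustrative.

#include <stdint.h>
#include <stdio.h>

/* Rate the socket advertises: bytes sent per RTT, scaled to bytes/sec. */
static uint64_t pacing_rate_bytes_per_sec(uint32_t cwnd, uint32_t mss,
                                          uint64_t srtt_us)
{
        return (uint64_t)cwnd * mss * 1000000ULL / srtt_us;
}

/* Delay the scheduler (or the internal pacing fallback) derived from it. */
static uint64_t inter_packet_gap_ns(uint32_t skb_len,
                                    uint64_t rate_bytes_per_sec)
{
        return (uint64_t)skb_len * 1000000000ULL / rate_bytes_per_sec;
}

int main(void)
{
        /* cwnd=20, mss=1448, srtt=165us -> ~175 MB/s (~1.4 Gbps) */
        uint64_t rate = pacing_rate_bytes_per_sec(20, 1448, 165);
        /* a 1448-byte segment paced at 24 Mbps (3 MB/s) -> ~483 us gap */
        uint64_t gap = inter_packet_gap_ns(1448, 3000000);

        printf("rate=%llu B/s, gap=%llu ns\n",
               (unsigned long long)rate, (unsigned long long)gap);
        return 0;
}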
This worked reasonably well, but had the side effect that TCP RTT
samples would be inflated by the sojourn time of the packets in FQ.
Also note that when FQ is not used and TCP wants pacing, the
internal pacing fallback behaves very differently, since TCP
emits packets at the time they should be sent (with unreasonable
assumptions about scheduling costs).
Van Jacobson gave a talk at Netdev 0x12 in Montreal about letting
TCP (or applications, for UDP messages) decide on the Earliest
Departure Time, instead of letting packet schedulers derive it
from the pacing rate:
https://www.netdevconf.org/0x12/session.html?evolving-from-afap-teaching-nics-about-time
https://www.files.netdevconf.org/d/46def75c2ef345809bbe/files/?p=/Evolving%20from%20AFAP%20%E2%80%93%20Teaching%20NICs%20about%20time.pdf
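To make the model concrete, here is a minimal sketch of the EDT idea under the same
assumptions: the sender stamps every packet with its own earliest departure time and
the scheduler merely honors that stamp, instead of being told a rate. This is not the
actual sch_fq code (which also handles per-flow queues, quantum and a horizon); the
names edt_wait_ns/edt_advance are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* A packet carrying its own earliest departure time, chosen by the
 * sender (TCP, or an application via SO_TXTIME for UDP).
 */
struct pkt {
        uint64_t tstamp_ns;     /* earliest departure time */
        uint32_t len;
};

/* Scheduler side: hold the packet until its departure time. */
static uint64_t edt_wait_ns(const struct pkt *p, uint64_t now_ns)
{
        return (p->tstamp_ns > now_ns) ? p->tstamp_ns - now_ns : 0;
}

/* Sender side: after emitting a packet, push the flow's departure
 * cursor forward by len/rate and stamp the next packet with it.
 */
static uint64_t edt_advance(uint64_t departure_ns, uint32_t len,
                            uint64_t rate_bytes_per_sec)
{
        return departure_ns + (uint64_t)len * 1000000000ULL / rate_bytes_per_sec;
}

int main(void)
{
        struct pkt p = { .tstamp_ns = 1000000, .len = 1448 };
        uint64_t next = edt_advance(p.tstamp_ns, p.len, 3000000); /* 24 Mbps */

        printf("wait %llu ns now, next departure at %llu ns\n",
               (unsigned long long)edt_wait_ns(&p, 500000),
               (unsigned long long)next);
        return 0;
}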
Recent additions in Linux provided SO_TXTIME and a new ETF qdisc
supporting the new skb->tstamp role.
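A minimal user-space sketch of that SO_TXTIME path follows, assuming Linux 4.19+
headers that define SO_TXTIME, SCM_TXTIME and struct sock_txtime; error handling is
omitted and the destination address is a placeholder. The clock has to match the
qdisc: fq expects CLOCK_MONOTONIC timestamps, while etf is usually configured with
CLOCK_TAI.

#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/net_tstamp.h>   /* struct sock_txtime, SOF_TXTIME_* */

#ifndef SO_TXTIME
#define SO_TXTIME       61
#define SCM_TXTIME      SO_TXTIME
#endif

/* Queue one datagram with an explicit earliest departure time (ns). */
static void send_at(int fd, const void *buf, size_t len,
                    const struct sockaddr_in *dst, uint64_t txtime_ns)
{
        char ctl[CMSG_SPACE(sizeof(uint64_t))] = {0};
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_name = (void *)dst, .msg_namelen = sizeof(*dst),
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = ctl, .msg_controllen = sizeof(ctl),
        };
        struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type = SCM_TXTIME;
        cm->cmsg_len = CMSG_LEN(sizeof(uint64_t));
        memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

        sendmsg(fd, &msg, 0);
}

int main(void)
{
        struct sock_txtime cfg = { .clockid = CLOCK_MONOTONIC, .flags = 0 };
        struct sockaddr_in dst = {
                .sin_family = AF_INET,
                .sin_port = htons(9),                   /* placeholder */
                .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        };
        struct timespec ts;
        uint64_t now_ns;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));

        clock_gettime(CLOCK_MONOTONIC, &ts);
        now_ns = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;

        /* ask fq/etf to release this packet 1 ms from now */
        send_at(fd, "hello", 5, &dst, now_ns + 1000000);

        close(fd);
        return 0;
}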
This patch series converts TCP and FQ to the same model.
This might in the future allow us to relax tight TSQ limits
(if FQ is present in the output path), and thus lower the
number of callbacks to tcp_write_xmit(), thanks to batching.
This will be followed by an FQ change adding SO_TXTIME support,
so that QUIC servers can let the pacing be done in FQ (or
offloaded if the network device permits).
For example, a TCP flow paced at 24 Mbps now shows a more meaningful RTT:
Before:
ESTAB 0 211408 10.246.7.151:41558 10.246.7.152:33723
cubic wscale:8,8 rto:203 rtt:2.195/0.084 mss:1448 rcvmss:536
advmss:1448 cwnd:20 ssthresh:20 bytes_acked:36897937
segs_out:25488 segs_in:12454 data_segs_out:25486
send 105.5Mbps lastsnd:1 lastrcv:12851 lastack:1
pacing_rate 24.0Mbps/24.0Mbps delivery_rate 22.9Mbps
busy:12851ms unacked:4 rcv_space:29200 notsent:205616 minrtt:0.026
After:
ESTAB 0 192584 10.246.7.151:61612 10.246.7.152:34375
cubic wscale:8,8 rto:201 rtt:0.165/0.129 mss:1448 rcvmss:536
advmss:1448 cwnd:20 ssthresh:20 bytes_acked:170755401
segs_out:117931 segs_in:57651 data_segs_out:117929
send 1404.1Mbps lastsnd:1 lastrcv:56915 lastack:1
pacing_rate 24.0Mbps/24.0Mbps delivery_rate 24.2Mbps
busy:56915ms unacked:4 rcv_space:29200 notsent:186792 minrtt:0.054
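Rough arithmetic for scale (added here, derived from the numbers above): at a
24 Mbps pace, one 1448-byte MSS is worth about 1448*8/24e6 ≈ 0.48 ms, so a segment
handed to FQ behind a few already-queued packets (unacked:4 above) can wait on the
order of 1.5-2 ms in the qdisc. Under the old model that sojourn time was part of
the RTT sample, which is roughly the ~2.2 ms "Before" rtt; the ~0.165 ms "After"
rtt no longer includes time spent queued in FQ.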
A nice side effect of this patch series is a reduction of max/p99
latencies of RPC workloads, since the FQ quantum no longer adds
artifacts.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--   net/ipv4/tcp_output.c   68
1 file changed, 44 insertions(+), 24 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05d..fe7855b090e4f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -45,6 +45,22 @@
 #include <trace/events/tcp.h>
 
+/* Refresh clocks of a TCP socket,
+ * ensuring monotically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+	u64 val = tcp_clock_ns();
+
+	/* departure time for next data packet */
+	if (val > tp->tcp_wstamp_ns)
+		tp->tcp_wstamp_ns = val;
+
+	val = div_u64(val, NSEC_PER_USEC);
+	if (val > tp->tcp_mstamp)
+		tp->tcp_mstamp = val;
+}
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
@@ -977,28 +993,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_internal_pacing(struct sock *sk)
 {
-	u64 len_ns;
-	u32 rate;
-
 	if (!tcp_needs_internal_pacing(sk))
 		return;
-	rate = sk->sk_pacing_rate;
-	if (!rate || rate == ~0U)
-		return;
-
-	len_ns = (u64)skb->len * NSEC_PER_SEC;
-	do_div(len_ns, rate);
 	hrtimer_start(&tcp_sk(sk)->pacing_timer,
-		      ktime_add_ns(ktime_get(), len_ns),
+		      ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns),
 		      HRTIMER_MODE_ABS_PINNED_SOFT);
 	sock_hold(sk);
 }
 
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb)
 {
-	skb->skb_mstamp = tp->tcp_mstamp;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+	if (sk->sk_pacing_status != SK_PACING_NONE) {
+		u32 rate = sk->sk_pacing_rate;
+
+		/* Original sch_fq does not pace first 10 MSS
+		 * Note that tp->data_segs_out overflows after 2^32 packets,
+		 * this is a minor annoyance.
+		 */
+		if (rate != ~0U && rate && tp->data_segs_out >= 10) {
+			tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate);
+
+			tcp_internal_pacing(sk);
+		}
+	}
 	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 }
 
@@ -1045,7 +1067,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		if (unlikely(!skb))
 			return -ENOBUFS;
 	}
-	skb->skb_mstamp = tp->tcp_mstamp;
+	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
 	inet = inet_sk(sk);
 	tcb = TCP_SKB_CB(skb);
@@ -1137,7 +1159,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
 		tp->bytes_sent += skb->len - tcp_header_size;
-		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1149,8 +1170,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
 	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
 
-	/* Our usage of tstamp should remain private */
-	skb->tstamp = 0;
+	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
 
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
@@ -1163,7 +1183,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
 		err = net_xmit_eval(err);
 	}
 	if (!err && oskb) {
-		tcp_update_skb_after_send(tp, oskb);
+		tcp_update_skb_after_send(sk, oskb);
 		tcp_rate_skb_sent(sk, oskb);
 	}
 	return err;
@@ -1966,7 +1986,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	head = tcp_rtx_queue_head(sk);
 	if (!head)
 		goto send_now;
-	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
+	age = tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(head));
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
 		goto send_now;
@@ -2312,7 +2332,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 
 		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
 			/* "skb_mstamp" is used as a start point for the retransmit timer */
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2887,7 +2907,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 		}
 		tcp_skb_tsorted_restore(skb);
 		if (!err) {
-			tcp_update_skb_after_send(tp, skb);
+			tcp_update_skb_after_send(sk, skb);
 			tcp_rate_skb_sent(sk, skb);
 		}
 	} else {
@@ -3205,10 +3225,10 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	memset(&opts, 0, sizeof(opts));
 #ifdef CONFIG_SYN_COOKIES
 	if (unlikely(req->cookie_ts))
-		skb->skb_mstamp = cookie_init_timestamp(req);
+		skb->skb_mstamp_ns = cookie_init_timestamp(req);
 	else
 #endif
-		skb->skb_mstamp = tcp_clock_us();
+		skb->skb_mstamp_ns = tcp_clock_ns();
 
 #ifdef CONFIG_TCP_MD5SIG
 	rcu_read_lock();
@@ -3424,7 +3444,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
 
-	syn->skb_mstamp = syn_data->skb_mstamp;
+	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;
 
 	/* Now full SYN+DATA was cloned and sent (or not),
 	 * remove the SYN from the original skb (syn_data)