patch-2.3.41 linux/net/ipv4/tcp_output.c

diff -u --recursive --new-file v2.3.40/linux/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_output.c,v 1.116 2000/01/13 00:19:49 davem Exp $
+ * Version:	$Id: tcp_output.c,v 1.119 2000/01/19 04:06:15 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -31,6 +31,7 @@
  *					during syn/ack processing.
  *		David S. Miller :	Output engine completely rewritten.
  *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
+ *		Cacophonix Gaul :	draft-minshall-nagle-01
  *
  */
 
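The draft-minshall-nagle-01 change credited above relaxes Nagle's algorithm: a sub-MSS segment may be sent as long as no other sub-MSS segment is still unacknowledged, instead of waiting until all outstanding data has been acknowledged. A minimal standalone sketch of that test follows; the struct and function names are illustrative, only the logic mirrors the tcp_minshall_update()/tcp_snd_test() changes made by this patch.

    #include <stdbool.h>
    #include <stdint.h>

    /* Wrapping 32-bit sequence comparison: true if a precedes b. */
    static bool seq_before(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) < 0;
    }

    struct mini_tp {
        uint32_t snd_una;   /* oldest unacknowledged sequence number       */
        uint32_t snd_sml;   /* end_seq of the last sub-MSS segment we sent */
    };

    /* Minshall's variant: hold back a small segment only while another
     * small segment is still in flight; full-sized segments never wait. */
    static bool small_segment_blocked(const struct mini_tp *tp,
                                      unsigned len, unsigned mss)
    {
        if (len >= mss)
            return false;
        return seq_before(tp->snd_una, tp->snd_sml);
    }

    /* Bookkeeping after each transmit. */
    static void small_segment_sent(struct mini_tp *tp, unsigned mss,
                                   unsigned len, uint32_t end_seq)
    {
        if (len < mss)
            tp->snd_sml = end_seq;
    }
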
@@ -38,75 +39,65 @@
 
 #include <linux/smp_lock.h>
 
-extern int sysctl_tcp_timestamps;
-extern int sysctl_tcp_window_scaling;
-extern int sysctl_tcp_sack;
-
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse = 1;
 
-/* Get rid of any delayed acks, we sent one already.. */
-static __inline__ void clear_delayed_acks(struct sock * sk)
-{
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-
-	tp->delayed_acks = 0;
-	if(tcp_in_quickack_mode(tp))
-		tcp_exit_quickack_mode(tp);
-	tcp_clear_xmit_timer(sk, TIME_DACK);
-}
-
 static __inline__ void update_send_head(struct sock *sk)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-	
+
 	tp->send_head = tp->send_head->next;
 	if (tp->send_head == (struct sk_buff *) &sk->write_queue)
 		tp->send_head = NULL;
 }
 
 /* Calculate mss to advertise in SYN segment.
-   RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
-
-   1. It is independent of path mtu.
-   2. Ideally, it is maximal possible segment size i.e. 65535-40.
-   3. For IPv4 it is reasonable to calculate it from maximal MTU of
-      attached devices, because some buggy hosts are confused by
-      large MSS.
-   4. We do not make 3, we advertise MSS, calculated from first
-      hop device mtu, but allow to raise it to ip_rt_min_advmss.
-      This may be overriden via information stored in routing table.
-   5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
-      probably even Jumbo".
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ *    attached devices, because some buggy hosts are confused by
+ *    large MSS.
+ * 4. We do not make 3, we advertise MSS, calculated from first
+ *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
+ *    This may be overriden via information stored in routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ *    probably even Jumbo".
  */
 static __u16 tcp_advertise_mss(struct sock *sk)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	int mss;
+	int mss = tp->advmss;
 
-	if (dst) {
+	if (dst && dst->advmss < mss) {
 		mss = dst->advmss;
-	} else {
-		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+		tp->advmss = mss;
+	}
 
-		/* No dst. It is bad. Guess some reasonable value.
-		 * Actually, this case should not be possible.
-		 * SANITY.
-		 */
-		BUG_TRAP(dst!=NULL);
+	return (__u16)mss;
+}
 
-		mss = tp->mss_cache;
-		mss += (tp->tcp_header_len - sizeof(struct tcphdr)) +
-			tp->ext_header_len;
+static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb)
+{
+	/* If we had a reply for ato after last received
+	 * packet, enter pingpong mode.
+	 */
+	if ((u32)(tp->lsndtime - tp->ack.lrcvtime) < tp->ack.ato)
+		tp->ack.pingpong = 1;
 
-		/* Minimal MSS to include full set of of TCP/IP options
-		   plus 8 bytes of data. It corresponds to mtu 128.
-		 */
-		if (mss < 88)
-			mss = 88;
-	}
+	tp->lsndtime = tcp_time_stamp;
+}
 
-	return (__u16)mss;
+static __inline__ void tcp_event_ack_sent(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+	tp->last_ack_sent = tp->rcv_nxt;
+	tcp_dec_quickack_mode(tp);
+	tp->ack.pending = 0;
+	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
 }
 
 /* This routine actually transmits TCP packets queued in by
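The new tcp_event_data_sent() above introduces a "pingpong" flag: when data goes out within one delayed-ACK timeout (ato) of the last received segment, the traffic looks interactive, so the stack prefers to piggy-back ACKs on outgoing data instead of acknowledging immediately. A simplified sketch of the heuristic; the names are illustrative and the timestamps are passed in explicitly rather than read from jiffies.

    #include <stdint.h>

    struct mini_ack {
        uint32_t lrcvtime;  /* tick count when the last segment arrived */
        uint32_t ato;       /* current delayed-ACK timeout estimate     */
        int      pingpong;  /* 1 = interactive, favour delayed ACKs     */
    };

    /* Called whenever a data segment is transmitted.  Unsigned subtraction
     * keeps the comparison valid across tick-counter wraparound. */
    static void data_sent(struct mini_ack *ack, uint32_t *lsndtime, uint32_t now)
    {
        if (now - ack->lrcvtime < ack->ato)
            ack->pingpong = 1;
        *lsndtime = now;
    }
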
@@ -120,7 +111,7 @@
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if(skb != NULL) {
 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
@@ -128,6 +119,7 @@
 		int tcp_header_size = tp->tcp_header_len;
 		struct tcphdr *th;
 		int sysctl_flags;
+		int err;
 
 #define SYSCTL_FLAG_TSTAMPS	0x1
 #define SYSCTL_FLAG_WSCALE	0x2
@@ -190,11 +182,29 @@
 		}
 		tp->af_specific->send_check(sk, th, skb->len, skb);
 
-		clear_delayed_acks(sk);
-		tp->last_ack_sent = tp->rcv_nxt;
+		if (th->ack)
+			tcp_event_ack_sent(sk);
+
+		if (skb->len != tcp_header_size)
+			tcp_event_data_sent(tp, skb);
+
 		TCP_INC_STATS(TcpOutSegs);
-		tp->af_specific->queue_xmit(skb);
+
+		err = tp->af_specific->queue_xmit(skb);
+		if (err <= 0)
+			return err;
+
+		tcp_enter_cong_avoid(tp);
+
+		/* NET_XMIT_CN is special. It does not guarantee,
+		 * that this packet is lost. It tells that device
+		 * is about to start to drop packets or already
+		 * drops some packets of the same priority and
+		 * invokes us to send less aggressively.
+		 */
+		return err == NET_XMIT_CN ? 0 : err;
 	}
+	return -ENOBUFS;
 #undef SYSCTL_FLAG_TSTAMPS
 #undef SYSCTL_FLAG_WSCALE
 #undef SYSCTL_FLAG_SACK
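The error handling added above makes tcp_transmit_skb() propagate the result of queue_xmit(), with the twist spelled out in the new comment: NET_XMIT_CN only means the local device is getting congested, not that the segment was lost, so it is reported to callers as success while still triggering the congestion response. A standalone sketch of that mapping; the numeric codes below are stand-ins for the real NET_XMIT_* constants.

    /* Stand-in transmit result codes. */
    enum xmit_result {
        XMIT_SUCCESS = 0,   /* queued for transmission                   */
        XMIT_DROP    = 1,   /* definitely dropped locally                */
        XMIT_CN      = 2,   /* device congested; packet may still go out */
    };

    /* Congestion notification counts as success (the segment is probably
     * queued anyway); a hard local drop is surfaced so the caller keeps
     * the segment on its queue and tries again later. */
    static int report_xmit(enum xmit_result res)
    {
        if (res == XMIT_CN)
            return 0;
        return (int)res;
    }
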
@@ -202,32 +212,33 @@
 
 /* This is the main buffer sending routine. We queue the buffer
  * and decide whether to queue or transmit now.
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
  */
-void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
+void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 	/* Advance write_seq and place onto the write_queue. */
-	tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
 	__skb_queue_tail(&sk->write_queue, skb);
 
-	if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
+	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, 1)) {
 		/* Send it out now. */
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-		tp->packets_out++;
-		tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
-		if(!tcp_timer_is_set(sk, TIME_RETRANS))
-			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
-	} else {
-		/* Queue it, remembering where we must start sending. */
-		if (tp->send_head == NULL)
-			tp->send_head = skb;
-		if (!force_queue && tp->packets_out == 0 && !tp->pending) {
-			tp->pending = TIME_PROBE0;
-			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+		if (tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)) == 0) {
+			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+			tcp_minshall_update(tp, cur_mss, skb->len);
+			tp->packets_out++;
+			if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			return;
 		}
 	}
+	/* Queue it, remembering where we must start sending. */
+	if (tp->send_head == NULL)
+		tp->send_head = skb;
 }
 
 /* Function to create two new TCP segments.  Shrinks the given segment
@@ -243,13 +254,13 @@
 
 	/* Get a new skb... force flag on. */
 	buff = sock_wmalloc(sk,
-			    (nsize + MAX_HEADER + sk->prot->max_header),
+			    (nsize + MAX_TCP_HEADER + 15),
 			    1, GFP_ATOMIC);
 	if (buff == NULL)
-		return -1; /* We'll just try again later. */
+		return -ENOMEM; /* We'll just try again later. */
 
 	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+	skb_reserve(buff, MAX_TCP_HEADER);
 		
 	/* Correct the sequence numbers. */
 	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
@@ -276,8 +287,8 @@
 	TCP_SKB_CB(buff)->sacked = 0;
 
 	/* Copy and checksum data tail into the new buffer. */
-	buff->csum = csum_partial_copy(skb->data + len, skb_put(buff, nsize),
-				       nsize, 0);
+	buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+					       nsize, 0);
 
 	/* This takes care of the FIN sequence number too. */
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
@@ -288,6 +299,11 @@
 
 	/* Looks stupid, but our code really uses when of
 	 * skbs, which it never sent before. --ANK
+	 *
+	 * NOTE: several days after I added this, Dave repaired
+	 * tcp_simple_retransmit() and it should not use ->when
+	 * of never sent skbs more. I am not sure, so that
+	 * this line remains until more careful investigation. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
 
@@ -335,20 +351,19 @@
 	if (mss_now > tp->mss_clamp)
 		mss_now = tp->mss_clamp;
 
-	/* Now subtract TCP options size, not including SACKs */
-	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
-
 	/* Now subtract optional transport overhead */
 	mss_now -= tp->ext_header_len;
 
-	/* It we got too small (or even negative) value,
-	   clamp it by 8 from below. Why 8 ?
-	   Well, it could be 1 with the same success,
-	   but if IP accepted segment of length 1,
-	   it would love 8 even more 8)		--ANK (980731)
-	 */
-	if (mss_now < 8)
-		mss_now = 8;
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	/* Bound mss with half of window */
+	if (tp->max_window && mss_now > (tp->max_window>>1))
+		mss_now = max((tp->max_window>>1), 1);
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
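The reordering above changes tcp_sync_mss() to clamp the value to at least 48 bytes, room for a maximal 40-byte option block plus 8 bytes of data (the mtu 128 case), before the fixed option overhead is subtracted, and then additionally bounds the result by half of the largest window the peer has ever advertised. A worked standalone version of the arithmetic, with illustrative parameter names:

    static unsigned umax(unsigned a, unsigned b) { return a > b ? a : b; }

    /* Simplified MSS synchronisation in the order used by the patch.
     *   pmtu        - current path MTU
     *   hdr_len     - IP + TCP header size (40 for plain IPv4)
     *   opt_len     - fixed per-segment TCP option bytes (timestamps etc.)
     *   ext_hdr_len - additional transport headers, if any
     *   mss_clamp   - MSS advertised by the peer
     *   max_window  - largest window the peer ever offered (0 = unknown)
     */
    static unsigned sync_mss(unsigned pmtu, unsigned hdr_len, unsigned opt_len,
                             unsigned ext_hdr_len, unsigned mss_clamp,
                             unsigned max_window)
    {
        unsigned mss = pmtu - hdr_len;

        if (mss > mss_clamp)
            mss = mss_clamp;
        mss -= ext_hdr_len;

        /* Keep room for a full option set plus 8 data bytes. */
        if (mss < 48)
            mss = 48;

        /* Fixed options; SACK blocks are accounted for elsewhere. */
        mss -= opt_len;

        /* Never use more than half of the peer's largest window. */
        if (max_window && mss > max_window / 2)
            mss = umax(max_window / 2, 1);

        return mss;
    }
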
@@ -360,27 +375,30 @@
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
+ *
+ * Returns 1, if no segments are in flight and we have queued segments, but
+ * cannot send anything now because of SWS or another problem.
  */
-void tcp_write_xmit(struct sock *sk)
+int tcp_write_xmit(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	unsigned int mss_now;
 
-	/* Account for SACKS, we may need to fragment due to this.
-	 * It is just like the real MSS changing on us midstream.
-	 * We also handle things correctly when the user adds some
-	 * IP options mid-stream.  Silly to do, but cover it.
-	 */
-	mss_now = tcp_current_mss(sk); 
-
-	/* If we are zapped, the bytes will have to remain here.
-	 * In time closedown will empty the write queue and all
+	/* If we are closed, the bytes will have to remain here.
+	 * In time closedown will finish, we empty the write queue and all
 	 * will be happy.
 	 */
-	if(!sk->zapped) {
+	if(sk->state != TCP_CLOSE) {
 		struct sk_buff *skb;
 		int sent_pkts = 0;
 
+		/* Account for SACKS, we may need to fragment due to this.
+		 * It is just like the real MSS changing on us midstream.
+		 * We also handle things correctly when the user adds some
+		 * IP options mid-stream.  Silly to do, but cover it.
+		 */
+		mss_now = tcp_current_mss(sk); 
+
 		/* Anything on the transmit queue that fits the window can
 		 * be added providing we are:
 		 *
@@ -388,27 +406,36 @@
 		 * b) not exceeding our congestion window.
 		 * c) not retransmitting [Nagle]
 		 */
-		while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
+		while((skb = tp->send_head) &&
+		      tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb))) {
 			if (skb->len > mss_now) {
 				if (tcp_fragment(sk, skb, mss_now))
 					break;
 			}
 
-			/* Advance the send_head.  This one is going out. */
-			update_send_head(sk);
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
+			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+				break;
+			/* Advance the send_head.  This one is sent out. */
+			update_send_head(sk);
 			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+			tcp_minshall_update(tp, mss_now, skb->len);
 			tp->packets_out++;
-			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 			sent_pkts = 1;
 		}
 
 		/* If we sent anything, make sure the retransmit
 		 * timer is active.
 		 */
-		if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
-			tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+		if (sent_pkts) {
+			if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			return 0;
+		}
+
+		return !tp->packets_out && tp->send_head;
 	}
+	return 0;
 }
 
 /* This function returns the amount that we can raise the
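tcp_write_xmit() now reports through its return value the case where data is queued but nothing is in flight and nothing could be sent; as the NOTE added to tcp_send_skb() warns, the caller has to arm the zero-window probe timer in that situation or the socket can stall. A rough caller-side sketch with stand-in declarations, not the kernel's actual helper:

    struct sock;
    struct tcp_state {
        void *send_head;    /* first not-yet-sent segment, if any */
    };

    /* 0 = progress made or nothing to do; 1 = data queued, nothing in
     * flight, and nothing could be sent (SWS, zero window, ...). */
    extern int tcp_write_xmit(struct sock *sk);

    /* Hypothetical helper standing in for tcp_check_probe_timer(). */
    extern void arm_probe0_timer(struct sock *sk);

    static void push_pending_frames(struct sock *sk, struct tcp_state *tp)
    {
        if (tp->send_head && tcp_write_xmit(sk))
            arm_probe0_timer(sk);
    }
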
@@ -471,7 +498,7 @@
 	 * but may be worse for the performance because of rcv_mss
 	 * fluctuations.  --SAW  1998/11/1
 	 */
-	unsigned int mss = tp->rcv_mss;
+	unsigned int mss = tp->ack.rcv_mss;
 	int free_space;
 	u32 window;
 
@@ -481,11 +508,19 @@
 		free_space = tp->window_clamp;
 	if (tp->window_clamp < mss)
 		mss = tp->window_clamp; 
-	
-	if ((free_space < (tcp_full_space(sk) / 2)) && 
+
+	if ((free_space < (min((int)tp->window_clamp, tcp_full_space(sk)) / 2)) && 
 		(free_space < ((int) (mss/2)))) {
 		window = 0;
-		tp->pred_flags = 0; 
+
+		/* THIS IS _VERY_ GOOD PLACE to play window clamp.
+		 * if free_space becomes suspiciously low
+		 * verify ratio rmem_alloc/(rcv_nxt - copied_seq),
+		 * and if we predict that when free_space will be lower mss,
+		 * rmem_alloc will run out of rcvbuf*2, shrink window_clamp.
+		 * It will eliminate most of prune events! Very simple,
+		 * it is the next thing to do.			--ANK
+		 */
 	} else {
 		/* Get the largest window that is a nice multiple of mss.
 		 * Window clamp already applied above.
@@ -542,9 +577,9 @@
 			/* Optimize, actually we could also combine next_skb->csum
 			 * to skb->csum using a single add w/carry operation too.
 			 */
-			skb->csum = csum_partial_copy(next_skb->data,
-						      skb_put(skb, next_skb_size),
-						      next_skb_size, skb->csum);
+			skb->csum = csum_partial_copy_nocheck(next_skb->data,
+							      skb_put(skb, next_skb_size),
+							      next_skb_size, skb->csum);
 		}
 	
 		/* Update sequence range on original skb. */
@@ -603,8 +638,10 @@
 		if (old_next_skb != skb || skb->len > mss)
 			resend_skb = 1;
 		old_next_skb = skb->next;
-		if (resend_skb != 0)
-			tcp_retransmit_skb(sk, skb);
+		if (resend_skb != 0) {
+			if (tcp_retransmit_skb(sk, skb))
+				break;
+		}
 	}
 }
 
@@ -629,9 +666,21 @@
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	unsigned int cur_mss = tcp_current_mss(sk);
 
+#ifdef TCP_DEBUG
+	/* It was possible this summer, that retransmit timer
+	 * raced with its deletion and hit socket with packets_out==0.
+	 * I fixed it, but preserved the check in the place,
+	 * where the fault occured. --ANK
+	 */
+	if (skb == NULL) {
+		printk("tcp_retransmit_skb: bug, skb==NULL, caller=%p\n", NET_CALLER(sk));
+		return -EFAULT;
+	}
+#endif
+
 	if(skb->len > cur_mss) {
 		if(tcp_fragment(sk, skb, cur_mss))
-			return 1; /* We'll try again later. */
+			return -ENOMEM; /* We'll try again later. */
 
 		/* New SKB created, account for it. */
 		tp->packets_out++;
@@ -646,7 +695,7 @@
 		tcp_retrans_try_collapse(sk, skb, cur_mss);
 
 	if(tp->af_specific->rebuild_header(sk))
-		return 1; /* Routing failure or similar. */
+		return -EHOSTUNREACH; /* Routing failure or similar. */
 
 	/* Some Solaris stacks overoptimize and ignore the FIN on a
 	 * retransmit when old data is attached.  So strip it off
@@ -673,13 +722,10 @@
 	else
 		skb = skb_clone(skb, GFP_ATOMIC);
 
-	tcp_transmit_skb(sk, skb);
-
 	/* Update global TCP statistics and return success. */
-	sk->prot->retransmits++;
 	TCP_INC_STATS(TcpRetransSegs);
 
-	return 0;
+	return tcp_transmit_skb(sk, skb);
 }
 
 /* This gets called after a retransmit timeout, and the initially
@@ -774,7 +820,11 @@
 	 */
 	mss_now = tcp_current_mss(sk); 
 
-	if((tp->send_head != NULL) && (skb->len < mss_now)) {
+	/* Please, find seven differences of 2.3.33 and loook
+	 * what I broke here. 8) --ANK
+	 */
+
+	if(tp->send_head != NULL) {
 		/* tcp_write_xmit() takes care of the rest. */
 		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
 		TCP_SKB_CB(skb)->end_seq++;
@@ -783,31 +833,34 @@
 		/* Special case to avoid Nagle bogosity.  If this
 		 * segment is the last segment, and it was queued
 		 * due to Nagle/SWS-avoidance, send it out now.
+		 *
+		 * Hmm... actually it overrides also congestion
+		 * avoidance (OK for FIN) and retransmit phase
+		 * (not OK? Added.).
 		 */
 		if(tp->send_head == skb &&
-		   !sk->nonagle &&
-		   skb->len < (tp->rcv_mss >> 1) &&
-		   tp->packets_out &&
-		   !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) {
-			update_send_head(sk);
+		   !after(tp->write_seq, tp->snd_una + tp->snd_wnd) &&
+		   !tp->retransmits) {
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tp->packets_out++;
-			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
-			if(!tcp_timer_is_set(sk, TIME_RETRANS))
-				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+			if (!tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) {
+				update_send_head(sk);
+				tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+				tp->packets_out++;
+				if(!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+					tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			} else
+				tcp_check_probe_timer(sk, tp);
 		}
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		do {
 			skb = sock_wmalloc(sk,
-					   (MAX_HEADER +
-					    sk->prot->max_header),
+					   MAX_TCP_HEADER + 15,
 					   1, GFP_KERNEL);
 		} while (skb == NULL);
 
 		/* Reserve space for headers and prepare control bits. */
-		skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+		skb_reserve(skb, MAX_TCP_HEADER);
 		skb->csum = 0;
 		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
 		TCP_SKB_CB(skb)->sacked = 0;
@@ -816,7 +869,8 @@
 		/* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */
 		TCP_SKB_CB(skb)->seq = tp->write_seq;
 		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
-		tcp_send_skb(sk, skb, 0);
+		tcp_send_skb(sk, skb, 0, mss_now);
+		__tcp_push_pending_frames(sk, tp, mss_now);
 	}
 }
 
@@ -831,19 +885,19 @@
 	struct sk_buff *skb;
 
 	/* NOTE: No TCP options attached and we never retransmit this. */
-	skb = alloc_skb(MAX_HEADER + sk->prot->max_header, priority);
+	skb = alloc_skb(MAX_TCP_HEADER + 15, priority);
 	if (!skb)
 		return;
 
 	/* Reserve space for headers and prepare control bits. */
-	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+	skb_reserve(skb, MAX_TCP_HEADER);
 	skb->csum = 0;
 	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
 	TCP_SKB_CB(skb)->sacked = 0;
 	TCP_SKB_CB(skb)->urg_ptr = 0;
 
 	/* Send it off. */
-	TCP_SKB_CB(skb)->seq = tp->write_seq;
+	TCP_SKB_CB(skb)->seq = tp->snd_nxt;
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_transmit_skb(sk, skb);
@@ -859,13 +913,13 @@
 	struct tcp_opt* tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff* skb;	
 
-	skb = sock_wmalloc(sk, (MAX_HEADER + sk->prot->max_header),
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15,
 			   1, GFP_ATOMIC);
 	if (skb == NULL) 
 		return -ENOMEM;
 
 	/* Reserve space for headers and prepare control bits. */
-	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+	skb_reserve(skb, MAX_TCP_HEADER);
 	skb->csum = 0;
 	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_SYN);
 	TCP_SKB_CB(skb)->sacked = 0;
@@ -877,8 +931,7 @@
 	__skb_queue_tail(&sk->write_queue, skb);
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tp->packets_out++;
-	tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
-	return 0;
+	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 }
 
 /*
@@ -887,16 +940,17 @@
 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				 struct open_request *req)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct tcphdr *th;
 	int tcp_header_size;
 	struct sk_buff *skb;
 
-	skb = sock_wmalloc(sk, MAX_HEADER + sk->prot->max_header, 1, GFP_ATOMIC);
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
 	if (skb == NULL)
 		return NULL;
 
 	/* Reserve space for headers. */
-	skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+	skb_reserve(skb, MAX_TCP_HEADER);
 
 	skb->dst = dst_clone(dst);
 
@@ -919,7 +973,7 @@
 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
 		__u8 rcv_wscale; 
 		/* Set this up on the first call only */
-		req->window_clamp = skb->dst->window;
+		req->window_clamp = tp->window_clamp ? : skb->dst->window;
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk), 
 			dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
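The new window_clamp line above uses the GNU C "a ? : b" conditional with an omitted middle operand: it yields a when a is non-zero and b otherwise, evaluating a only once. Here it means a clamp configured on the listening socket wins over the route-derived window. A tiny standalone illustration; the function and parameter names are made up.

    #include <assert.h>

    /* "x ? : y" is equivalent to "x ? x : y" with x evaluated once. */
    static unsigned pick_window_clamp(unsigned configured, unsigned from_route)
    {
        return configured ? : from_route;
    }

    int main(void)
    {
        assert(pick_window_clamp(0, 32768) == 32768);     /* unset: use route */
        assert(pick_window_clamp(16384, 32768) == 16384); /* user value wins  */
        return 0;
    }
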
@@ -951,7 +1005,7 @@
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 	/* Reserve space for headers. */
-	skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+	skb_reserve(buff, MAX_TCP_HEADER + 15);
 
 	/* We'll fix this up when we get a response from the other end.
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -962,12 +1016,16 @@
 	/* If user gave his TCP_MAXSEG, record it to clamp */
 	if (tp->user_mss)
 		tp->mss_clamp = tp->user_mss;
+	tp->max_window = 0;
 	tcp_sync_mss(sk, dst->pmtu);
+	tcp_initialize_rcv_mss(sk);
 
-	tp->window_clamp = dst->window;
+	if (!tp->window_clamp)
+		tp->window_clamp = dst->window;
+	tp->advmss = dst->advmss;
 
 	tcp_select_initial_window(tcp_full_space(sk),
-		dst->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
+		tp->advmss - (tp->tcp_header_len - sizeof(struct tcphdr)),
 		&tp->rcv_wnd,
 		&tp->window_clamp,
 		sysctl_tcp_window_scaling,
@@ -982,10 +1040,12 @@
 		goto err_out;
 
 	sk->err = 0;
+	sk->done = 0;
 	tp->snd_wnd = 0;
 	tp->snd_wl1 = 0;
 	tp->snd_wl2 = tp->write_seq;
 	tp->snd_una = tp->write_seq;
+	tp->snd_sml = tp->write_seq;
 	tp->rcv_nxt = 0;
 	tp->rcv_wup = 0;
 	tp->copied_seq = 0;
@@ -1006,13 +1066,14 @@
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tp->syn_stamp = TCP_SKB_CB(buff)->when;
 	__skb_queue_tail(&sk->write_queue, buff);
 	tp->packets_out++;
 	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
 	TCP_INC_STATS(TcpActiveOpens);
 
 	/* Timer for repeating the SYN until an answer. */
-	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
 	return 0;
 
 err_out:
@@ -1025,16 +1086,14 @@
  * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
  * for details.
  */
-void tcp_send_delayed_ack(struct sock *sk, int max_timeout)
+void tcp_send_delayed_ack(struct sock *sk)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 	unsigned long timeout;
 
 	/* Stay within the limit we were given */
-	timeout = (tp->ato << 1) >> 1;
-	if (timeout > max_timeout)
-		timeout = max_timeout;
-	timeout += jiffies;
+	timeout = tp->ack.ato;
+	timeout += jiffies + (timeout>>2);
 
 	/* Use new timeout only if there wasn't a older one earlier. */
 	spin_lock_bh(&sk->timer_lock);
@@ -1042,18 +1101,46 @@
 		sock_hold(sk);
 		tp->delack_timer.expires = timeout;
 	} else {
+		/* If delack timer was blocked or is about to expire,
+		 * send ACK now.
+		 */
+		if (tp->ack.blocked || time_before_eq(tp->delack_timer.expires, jiffies+(tp->ack.ato>>2))) {
+			spin_unlock_bh(&sk->timer_lock);
+
+			tcp_send_ack(sk);
+			__sock_put(sk);
+			return;
+		}
+
 		if (time_before(timeout, tp->delack_timer.expires))
 			tp->delack_timer.expires = timeout;
 	}
 	add_timer(&tp->delack_timer);
 	spin_unlock_bh(&sk->timer_lock);
+
+#ifdef TCP_FORMAL_WINDOW
+	/* Explanation. Header prediction path does not handle
+	 * case of zero window. If we send ACK immediately, pred_flags
+	 * are reset when sending ACK. If rcv_nxt is advanced and
+	 * ack is not sent, than delayed ack is scheduled.
+	 * Hence, it is the best place to check for zero window.
+	 */
+	if (tp->pred_flags) {
+		if (tcp_receive_window(tp) == 0)
+			tp->pred_flags = 0;
+	} else {
+		if (skb_queue_len(&tp->out_of_order_queue) == 0 &&
+		    !tp->urg_data)
+			tcp_fast_path_on(tp);
+	}
+#endif
 }
 
 /* This routine sends an ack and also updates the window. */
 void tcp_send_ack(struct sock *sk)
 {
 	/* If we have been reset, we may not send again. */
-	if(!sk->zapped) {
+	if(sk->state != TCP_CLOSE) {
 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 		struct sk_buff *buff;
 
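The rewritten tcp_send_delayed_ack() above arms the timer for ato plus a quarter of ato, keeps the earlier expiry when a timer is already pending, and sends the ACK immediately when that pending timer was blocked (its handler found the socket locked) or would fire within ato/4 anyway. A standalone sketch of the decision; the names and the jiffies-style helpers are illustrative.

    #include <stdbool.h>
    #include <stdint.h>

    /* Wrapping tick comparisons in the style of time_before()/_eq(). */
    static bool tick_before(uint32_t a, uint32_t b)    { return (int32_t)(a - b) < 0; }
    static bool tick_before_eq(uint32_t a, uint32_t b) { return (int32_t)(a - b) <= 0; }

    /* Returns true if the ACK should go out right now; otherwise *expires
     * holds the timer value to (re)arm. */
    static bool delack_decide(uint32_t now, uint32_t ato,
                              bool timer_pending, bool blocked,
                              uint32_t *expires)
    {
        uint32_t timeout = now + ato + (ato >> 2);

        if (!timer_pending) {
            *expires = timeout;
            return false;
        }
        /* Pending timer was blocked or is about to fire: ACK immediately. */
        if (blocked || tick_before_eq(*expires, now + (ato >> 2)))
            return true;

        /* Otherwise keep the earlier of the old and the new expiry. */
        if (tick_before(timeout, *expires))
            *expires = timeout;
        return false;
    }
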
@@ -1061,29 +1148,15 @@
 		 * tcp_transmit_skb() will set the ownership to this
 		 * sock.
 		 */
-		buff = alloc_skb(MAX_HEADER + sk->prot->max_header, GFP_ATOMIC);
+		buff = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
 		if (buff == NULL) {
-			/* Force it to send an ack. We don't have to do this
-			 * (ACK is unreliable) but it's much better use of
-			 * bandwidth on slow links to send a spare ack than
-			 * resend packets.
-			 *
-			 * This is the one possible way that we can delay an
-			 * ACK and have tp->ato indicate that we are in
-			 * quick ack mode, so clear it.  It is also the only
-			 * possible way for ato to be zero, when ACK'ing a
-			 * SYNACK because we've taken no ATO measurement yet.
-			 */
-			if (tcp_in_quickack_mode(tp))
-				tcp_exit_quickack_mode(tp);
-			if (!tp->ato)
-				tp->ato = tp->rto;
-			tcp_send_delayed_ack(sk, HZ/2);
+			tp->ack.pending = 1;
+			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
 			return;
 		}
 
 		/* Reserve space for headers and prepare control bits. */
-		skb_reserve(buff, MAX_HEADER + sk->prot->max_header);
+		skb_reserve(buff, MAX_TCP_HEADER);
 		buff->csum = 0;
 		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
 		TCP_SKB_CB(buff)->sacked = 0;
@@ -1099,24 +1172,20 @@
 /* This routine sends a packet with an out of date sequence
  * number. It assumes the other end will try to ack it.
  */
-void tcp_write_wakeup(struct sock *sk)
+int tcp_write_wakeup(struct sock *sk)
 {
-	/* After a valid reset we can send no more. */
-	if (!sk->zapped) {
+	if (sk->state != TCP_CLOSE) {
 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 		struct sk_buff *skb;
 
-		/* Write data can still be transmitted/retransmitted in the
-		 * following states.  If any other state is encountered, return.
-		 * [listen/close will never occur here anyway]
+		/* Now this function is never called, while
+		 * we have something not ACKed in queue.
 		 */
-		if ((1 << sk->state) &
-		    ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
-		      TCPF_FIN_WAIT2|TCPF_LAST_ACK|TCPF_CLOSING))
-			return;
+		BUG_TRAP(tp->snd_una == tp->snd_nxt);
 
-		if (before(tp->snd_nxt, tp->snd_una + tp->snd_wnd) &&
-		    ((skb = tp->send_head) != NULL)) {
+		if (tp->snd_wnd > (tp->snd_nxt-tp->snd_una)
+		    && ((skb = tp->send_head) != NULL)) {
+			int err;
 			unsigned long win_size;
 
 			/* We are probing the opening of a window
@@ -1126,24 +1195,26 @@
 			win_size = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 			if (win_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
 				if (tcp_fragment(sk, skb, win_size))
-					return; /* Let a retransmit get it. */
+					return -1;
 			}
-			update_send_head(sk);
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tp->packets_out++;
-			tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
-			if (!tcp_timer_is_set(sk, TIME_RETRANS))
-				tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+			if (!err) {
+				update_send_head(sk);
+				tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+				tp->packets_out++;
+				if (!tcp_timer_is_set(sk, TCP_TIME_RETRANS))
+					tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+			}
+			return err;
 		} else {
 			/* We don't queue it, tcp_transmit_skb() sets ownership. */
-			skb = alloc_skb(MAX_HEADER + sk->prot->max_header,
-					GFP_ATOMIC);
+			skb = alloc_skb(MAX_TCP_HEADER + 15, GFP_ATOMIC);
 			if (skb == NULL) 
-				return;
+				return -1;
 
 			/* Reserve space for headers and set control bits. */
-			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->csum = 0;
 			TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
 			TCP_SKB_CB(skb)->sacked = 0;
@@ -1152,13 +1223,18 @@
 			/* Use a previous sequence.  This should cause the other
 			 * end to send an ack.  Don't queue or clone SKB, just
 			 * send it.
+			 *
+			 * RED-PEN: logically it should be snd_una-1.
+			 * snd_nxt-1 will not be acked. snd_una==snd_nxt
+			 * in this place however. Right?
 			 */
-			TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1;
+			TCP_SKB_CB(skb)->seq = tp->snd_una - 1;
 			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tcp_transmit_skb(sk, skb);
+			return tcp_transmit_skb(sk, skb);
 		}
 	}
+	return -1;
 }
 
 /* A window probe timeout has occurred.  If window is not closed send
@@ -1167,11 +1243,32 @@
 void tcp_send_probe0(struct sock *sk)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int err;
+
+	err = tcp_write_wakeup(sk);
+
+	if (tp->packets_out || !tp->send_head) {
+		/* Cancel probe timer, if it is not required. */
+		tp->probes_out = 0;
+		tp->backoff = 0;
+		return;
+	}
 
-	tcp_write_wakeup(sk);
-	tp->pending = TIME_PROBE0;
-	tp->backoff++;
-	tp->probes_out++;
-	tcp_reset_xmit_timer (sk, TIME_PROBE0, 
-			      min(tp->rto << tp->backoff, 120*HZ));
+	if (err <= 0) {
+		tp->backoff++;
+		tp->probes_out++;
+		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
+				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
+	} else {
+		/* If packet was not sent due to local congestion,
+		 * do not backoff and do not remember probes_out.
+		 * Let local senders to fight for local resources.
+		 *
+		 * Use accumulated backoff yet.
+		 */
+		if (!tp->probes_out)
+			tp->probes_out=1;
+		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
+				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+	}
 }
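The reworked tcp_send_probe0() distinguishes a probe that reached the IP layer (err <= 0: back off exponentially up to TCP_RTO_MAX and count it in probes_out) from one dropped by local congestion (retry on a short capped interval, keep the accumulated backoff without growing it, and do not let the attempt count toward giving up). A standalone sketch of the timeout choice; HZ and the interval constant are assumed values, not taken from the kernel headers.

    #include <stdint.h>

    #define HZ                  100u         /* assumed tick rate                         */
    #define RTO_MAX             (120u * HZ)  /* stand-in for TCP_RTO_MAX                  */
    #define RESOURCE_PROBE_IVAL (HZ / 2u)    /* stand-in for TCP_RESOURCE_PROBE_INTERVAL  */

    static uint32_t umin(uint32_t a, uint32_t b) { return a < b ? a : b; }

    /* err <= 0: probe was handed to the IP layer; err > 0: dropped locally. */
    static uint32_t next_probe0_timeout(int err, uint32_t rto,
                                        unsigned *backoff, unsigned *probes_out)
    {
        if (err <= 0) {
            (*backoff)++;
            (*probes_out)++;
            return umin(rto << *backoff, RTO_MAX);
        }

        /* Local congestion: reuse the accumulated backoff and record at
         * most one outstanding probe. */
        if (*probes_out == 0)
            *probes_out = 1;
        return umin(rto << *backoff, RESOURCE_PROBE_IVAL);
    }
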
