patch-2.3.41 linux/net/ipv4/tcp.c

diff -u --recursive --new-file v2.3.40/linux/net/ipv4/tcp.c linux/net/ipv4/tcp.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp.c,v 1.153 2000/01/09 02:19:33 davem Exp $
+ * Version:	$Id: tcp.c,v 1.160 2000/01/24 18:40:32 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -202,6 +202,8 @@
  *		Eric Schenk	:	Fix fast close down bug with
  *					shutdown() followed by close().
  *		Andi Kleen :	Make poll agree with SIGIO
+ *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
+ *					lingertime == 0 (RFC 793 ABORT Call)
  *					
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -432,113 +434,14 @@
 kmem_cache_t *tcp_bucket_cachep;
 kmem_cache_t *tcp_timewait_cachep;
 
-/*
- *	Find someone to 'accept'. Must be called with
- *	the listening socket locked.
- */
-
-static struct open_request *tcp_find_established(struct tcp_opt *tp, 
-						 struct open_request **prevp)
-{
-	struct open_request *req = tp->syn_wait_queue;
-	struct open_request *prev = (struct open_request *)&tp->syn_wait_queue; 
-	while(req) {
-		if (req->sk) {
-			if((1 << req->sk->state) &
-			   ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
-				break;
-		}
-		prev = req; 
-		req = req->dl_next;
-	}
-	*prevp = prev; 
-	return req;
-}
-
-/*
- *	Walk down the receive queue counting readable data.
- *
- *	Must be called with the socket lock held.
- */
-
-static int tcp_readable(struct sock *sk)
-{
-	unsigned long counted;
-	unsigned long amount;
-	struct sk_buff *skb;
-	int sum;
-
-	SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
-
-	skb = skb_peek(&sk->receive_queue);
-	if (skb == NULL) {
-		SOCK_DEBUG(sk, "empty\n");
-	  	return(0);
-	}
-
-	counted = sk->tp_pinfo.af_tcp.copied_seq;	/* Where we are at the moment */
-	amount = 0;
-
-	/* Do until a push or until we are out of data. */
-	do {
-		/* Found a hole so stops here. */
-		if (before(counted, TCP_SKB_CB(skb)->seq))	/* should not happen */
-			break;
-
-		/* Length - header but start from where we are up to
-		 * avoid overlaps.
-		 */
-		sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
-		if (sum >= 0) {
-			/* Add it up, move on. */
-			amount += sum;
-			counted += sum;
-			if (skb->h.th->syn)
-				counted++;
-		}
-
-		/* Don't count urg data ... but do it in the right place!
-		 * Consider: "old_data (ptr is here) URG PUSH data"
-		 * The old code would stop at the first push because
-		 * it counted the urg (amount==1) and then does amount--
-		 * *after* the loop.  This means tcp_readable() always
-		 * returned zero if any URG PUSH was in the queue, even
-		 * though there was normal data available. If we subtract
-		 * the urg data right here, we even get it to work for more
-		 * than one URG PUSH skb without normal data.
-		 * This means that poll() finally works now with urg data
-		 * in the queue.  Note that rlogin was never affected
-		 * because it doesn't use poll(); it uses two processes
-		 * and a blocking read().  And the queue scan in tcp_read()
-		 * was correct.  Mike <pall@rz.uni-karlsruhe.de>
-		 */
-
-		/* Don't count urg data. */
-		if (skb->h.th->urg)
-			amount--;
-#if 0
-		if (amount && skb->h.th->psh) break;
-#endif
-		skb = skb->next;
-	} while(skb != (struct sk_buff *)&sk->receive_queue);
-
-	SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
-	return(amount);
-}
+atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 
 /*
  * LISTEN is a special case for poll..
  */
-static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
+static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 {
-	struct open_request *req, *dummy;
-
-	lock_sock(sk);
-	req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
-	release_sock(sk);
-	if (req)
-		return POLLIN | POLLRDNORM;
-	return 0;
+	return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
 }
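
The rewritten tcp_listen_poll() reports a listening socket readable exactly when its accept queue is non-empty, which is what lets user space multiplex accept() with poll()/select(). A minimal user-space sketch of that pattern (port 8080, the 5 s timeout and the omitted error handling are illustrative assumptions, not part of the patch):

#include <string.h>
#include <unistd.h>
#include <poll.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in addr;
	struct pollfd pfd;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(8080);
	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
	listen(lfd, 16);

	pfd.fd = lfd;
	pfd.events = POLLIN;
	/* POLLIN on a listening socket == accept queue non-empty */
	if (poll(&pfd, 1, 5000) > 0 && (pfd.revents & POLLIN)) {
		int cfd = accept(lfd, NULL, NULL);	/* will not block */
		close(cfd);
	}
	close(lfd);
	return 0;
}
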
 
 /*
@@ -585,9 +488,25 @@
 	 * if you don't tell them that something has hung up!
 	 *
 	 * Check-me.
+	 *
+	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+	 * our fs/select.c). It means that after we received EOF,
+	 * poll always returns immediately, making impossible poll() on write()
+	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+	 * if and only if shutdown has been made in both directions.
+	 * Actually, it is interesting to look how Solaris and DUX
+	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
+	 * then we could set it on SND_SHUTDOWN. BTW examples given
+	 * in Stevens' books assume exactly this behaviour, it explains
+	 * why POLLHUP is incompatible with POLLOUT.	--ANK
+	 *
+	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+	 * blocking on fresh not-connected or disconnected socket. --ANK
 	 */
-	if (sk->shutdown & RCV_SHUTDOWN)
+	if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
 		mask |= POLLHUP;
+	if (sk->shutdown & RCV_SHUTDOWN)
+		mask |= POLLIN | POLLRDNORM;
 
 	/* Connected? */
 	if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
@@ -605,7 +524,7 @@
 			}
 		}
 
-		if (tp->urg_data & URG_VALID)
+		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
 	}
 	return mask;
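
With the hunk above, a connected socket now signals POLLIN|POLLRDNORM once the peer has shut down its sending side (the EOF is readable), while POLLHUP is reserved for the case where both directions are shut down or the socket is in TCP_CLOSE. A hedged user-space sketch of telling the two apart on an already connected socket fd (the fd and the 1 s timeout are assumptions of the example):

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

void check_peer_state(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	char buf[256];

	if (poll(&pfd, 1, 1000) <= 0)
		return;				/* timeout or error */

	if (pfd.revents & POLLHUP) {
		printf("both directions shut down\n");
	} else if (pfd.revents & POLLIN) {
		ssize_t n = read(fd, buf, sizeof(buf));
		if (n == 0)
			printf("peer sent FIN; we may still write\n");
		else if (n > 0)
			printf("read %zd bytes\n", n);
	}
}
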
@@ -631,32 +550,48 @@
 	read_unlock(&sk->callback_lock);
 }
 
+/* Listening TCP sockets never sleep to wait for memory, so
+ * it is completely silly to wake them up on queue space
+ * available events.  So we hook them up to this dummy callback.
+ */
+static void tcp_listen_write_space(struct sock *sk)
+{
+}
 
 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int answ;
 
 	switch(cmd) {
-	case TIOCINQ:
-#ifdef FIXME	/* FIXME: */
-	case FIONREAD:
-#endif
+	case SIOCINQ:
 		if (sk->state == TCP_LISTEN)
 			return(-EINVAL);
+
 		lock_sock(sk);
-		answ = tcp_readable(sk);
+		if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
+			answ = 0;
+		else if (sk->urginline || !tp->urg_data ||
+			 before(tp->urg_seq,tp->copied_seq) ||
+			 !before(tp->urg_seq,tp->rcv_nxt))
+			answ = tp->rcv_nxt - tp->copied_seq;
+		else
+			answ = tp->urg_seq - tp->copied_seq;
 		release_sock(sk);
 		break;
 	case SIOCATMARK:
 		{
-			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 			answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 			break;
 		}
-	case TIOCOUTQ:
+	case SIOCOUTQ:
 		if (sk->state == TCP_LISTEN)
 			return(-EINVAL);
-		answ = sock_wspace(sk);
+
+		if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = tp->write_seq - tp->snd_una;
 		break;
 	default:
 		return(-ENOIOCTLCMD);
@@ -665,12 +600,131 @@
 	return put_user(answ, (int *)arg);
 }
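
After this rewrite SIOCINQ answers "bytes received but not yet read" straight from the sequence space (tp->rcv_nxt - tp->copied_seq, stopping short of urgent data unless SO_OOBINLINE is set), and SIOCOUTQ answers "bytes written but not yet acknowledged" (tp->write_seq - tp->snd_una). A sketch of the corresponding user-space queries, assuming a connected TCP socket fd:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */

void report_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0)
		printf("unread bytes in receive queue: %d\n", inq);
	if (ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("unacked bytes in send queue:   %d\n", outq);
}
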
 
+
+int tcp_listen_start(struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct tcp_listen_opt *lopt;
+
+	sk->max_ack_backlog = 0;
+	sk->ack_backlog = 0;
+	tp->accept_queue = NULL;
+	tp->syn_wait_lock = RW_LOCK_UNLOCKED;
+
+	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
+	if (!lopt)
+		return -ENOMEM;
+
+	memset(lopt, 0, sizeof(struct tcp_listen_opt));
+	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
+		if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
+			break;
+
+	write_lock_bh(&tp->syn_wait_lock);
+	tp->listen_opt = lopt;
+	write_unlock_bh(&tp->syn_wait_lock);
+
+	sk->state = TCP_LISTEN;
+	if (sk->num == 0) {
+		if (sk->prot->get_port(sk, 0) != 0) {
+			sk->state = TCP_CLOSE;
+			write_lock_bh(&tp->syn_wait_lock);
+			tp->listen_opt = NULL;
+			write_unlock_bh(&tp->syn_wait_lock);
+			kfree(lopt);
+			return -EAGAIN;
+		}
+		sk->sport = htons(sk->num);
+	} else {
+		if (sk->prev)
+			((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
+	}
+
+	sk_dst_reset(sk);
+	sk->prot->hash(sk);
+	sk->socket->flags |= SO_ACCEPTCON;
+	sk->write_space = tcp_listen_write_space;
+
+	return 0;
+}
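
tcp_listen_start() sizes the SYN hash queue by rounding sysctl_max_syn_backlog up to the next power of two, starting at 2^6, and keeps only the exponent in max_qlen_log. The same rounding pulled out as a stand-alone sketch (1024 is just an example value, mirroring the larger default chosen in tcp_init() at the end of this patch):

#include <stdio.h>

int main(void)
{
	int max_syn_backlog = 1024;	/* stand-in for sysctl_max_syn_backlog */
	int max_qlen_log;

	for (max_qlen_log = 6; ; max_qlen_log++)
		if ((1 << max_qlen_log) >= max_syn_backlog)
			break;

	printf("max_qlen_log = %d (SYN queue limit %d)\n",
	       max_qlen_log, 1 << max_qlen_log);	/* prints 10 and 1024 */
	return 0;
}
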
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ */
+
+static void tcp_listen_stop (struct sock *sk)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct open_request *acc_req = tp->accept_queue;
+	struct open_request *req;
+	int i;
+
+	tcp_delete_keepalive_timer(sk);
+
+	/* make all the listen_opt local to us */
+	write_lock_bh(&tp->syn_wait_lock);
+	tp->listen_opt =NULL;
+	write_unlock_bh(&tp->syn_wait_lock);
+	tp->accept_queue = NULL;
+
+	if (lopt->qlen) {
+		for (i=0; i<TCP_SYNQ_HSIZE; i++) {
+			while ((req = lopt->syn_table[i]) != NULL) {
+				lopt->syn_table[i] = req->dl_next;
+				lopt->qlen--;
+				tcp_openreq_free(req);
+
+		/* Following specs, it would be better either to send FIN
+		 * (and enter FIN-WAIT-1, it is normal close)
+		 * or to send active reset (abort). 
+		 * Certainly, it is pretty dangerous while synflood, but it is
+		 * bad justification for our negligence 8)
+		 * To be honest, we are not able to make either
+		 * of the variants now.			--ANK
+		 */
+			}
+		}
+	}
+	BUG_TRAP(lopt->qlen == 0);
+
+	kfree(lopt);
+
+	while ((req=acc_req) != NULL) {
+		struct sock *child = req->sk;
+
+		acc_req = req->dl_next;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		BUG_TRAP(child->lock.users==0);
+		sock_hold(child);
+
+		tcp_disconnect(child, O_NONBLOCK);
+
+		sock_orphan(child);
+
+		atomic_inc(&tcp_orphan_count);
+
+		tcp_destroy_sock(child);
+
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		tcp_acceptq_removed(sk);
+		tcp_openreq_fastfree(req);
+	}
+	BUG_TRAP(sk->ack_backlog == 0);
+}
+
 /*
  *	Wait for a socket to get into the connected state
  *
  *	Note: Must be called with the socket locked.
  */
-static int wait_for_tcp_connect(struct sock * sk, int flags)
+static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -684,7 +738,7 @@
 				send_sig(SIGPIPE, tsk, 0);
 			return -EPIPE;
 		}
-		if(flags & MSG_DONTWAIT)
+		if(!*timeo_p)
 			return -EAGAIN;
 		if(signal_pending(tsk))
 			return -ERESTARTSYS;
@@ -694,7 +748,7 @@
 		sk->tp_pinfo.af_tcp.write_pending++;
 
 		release_sock(sk);
-		schedule();
+		*timeo_p = schedule_timeout(*timeo_p);
 		lock_sock(sk);
 
 		__set_task_state(tsk, TASK_RUNNING);
@@ -712,7 +766,7 @@
 /*
  *	Wait for more memory for a socket
  */
-static void wait_for_tcp_memory(struct sock * sk)
+static long wait_for_tcp_memory(struct sock * sk, long timeo)
 {
 	if (!tcp_memory_free(sk)) {
 		DECLARE_WAITQUEUE(wait, current);
@@ -732,12 +786,13 @@
 				break;
 			release_sock(sk);
 			if (!tcp_memory_free(sk))
-				schedule();
+				timeo = schedule_timeout(timeo);
 			lock_sock(sk);
 		}
 		current->state = TASK_RUNNING;
 		remove_wait_queue(sk->sleep, &wait);
 	}
+	return timeo;
 }
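
The connect/memory/data wait loops now carry a jiffies timeout taken from sock_sndtimeo()/sock_rcvtimeo() instead of only distinguishing blocking from MSG_DONTWAIT, so the SO_SNDTIMEO and SO_RCVTIMEO socket options become meaningful for TCP. A hedged user-space sketch (fd and the 2 s/5 s values are assumptions of the example):

#include <sys/socket.h>
#include <sys/time.h>

int set_io_timeouts(int fd)
{
	struct timeval snd_to = { .tv_sec = 2, .tv_usec = 0 };
	struct timeval rcv_to = { .tv_sec = 5, .tv_usec = 0 };

	/* After the timeout, blocked send()/recv() calls return what they
	 * managed so far, or -1 with EAGAIN if nothing was transferred. */
	if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &snd_to, sizeof(snd_to)))
		return -1;
	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &rcv_to, sizeof(rcv_to));
}
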
 
 /* When all user supplied data has been queued set the PSH bit */
@@ -746,11 +801,9 @@
 /*
  *	This routine copies from a user buffer into a socket,
  *	and starts the transmit system.
- *
- *	Note: must be called with the socket locked.
  */
 
-int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
+int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
 {
 	struct iovec *iov;
 	struct tcp_opt *tp;
@@ -758,15 +811,22 @@
 	int iovlen, flags;
 	int mss_now;
 	int err, copied;
+	long timeo;
 
 	err = 0;
 	tp = &(sk->tp_pinfo.af_tcp);
 
-	/* Wait for a connection to finish. */
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
 	flags = msg->msg_flags;
+
+	timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
 	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
-		if((err = wait_for_tcp_connect(sk, flags)) != 0)
-			goto out;
+		if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
+			goto out_unlock;
 
 	/* This should be in poll */
 	sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
@@ -777,7 +837,7 @@
 	iovlen = msg->msg_iovlen;
 	iov = msg->msg_iov;
 	copied = 0;
-	
+
 	while(--iovlen >= 0) {
 		int seglen=iov->iov_len;
 		unsigned char * from=iov->iov_base;
@@ -785,7 +845,7 @@
 		iov++;
 
 		while(seglen > 0) {
-			int copy, tmp, queue_it, psh;
+			int copy, tmp, queue_it;
 
 			if (err)
 				goto do_fault2;
@@ -811,8 +871,7 @@
 				 * welcome.
 				 */
 				if (skb_tailroom(skb) > 0 &&
-				    (mss_now - copy) > 0 &&
-				    tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
+				    (mss_now - copy) > 0) {
 					int last_byte_was_odd = (copy % 4);
 
 					copy = mss_now - copy;
@@ -855,34 +914,17 @@
 				}
 			}
 
-			/* We also need to worry about the window.  If
-			 * window < 1/2 the maximum window we've seen
-			 * from this host, don't use it.  This is
-			 * sender side silly window prevention, as
-			 * specified in RFC1122.  (Note that this is
-			 * different than earlier versions of SWS
-			 * prevention, e.g. RFC813.).  What we
-			 * actually do is use the whole MSS.  Since
-			 * the results in the right edge of the packet
-			 * being outside the window, it will be queued
-			 * for later rather than sent.
+			/* A chunk was here doing something strange
+			 * with psh etc. It is deleted, because it was
+			 * evident non-sense.			 --ANK
 			 */
-			psh = 0;
-			copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-			if(copy > (tp->max_window >> 1)) {
-				copy = min(copy, mss_now);
-				psh = 1;
-			} else {
-				copy = mss_now;
-			}
-			if(copy > seglen)
-				copy = seglen;
+
+			copy = min(seglen, mss_now);
 
 			/* Determine how large of a buffer to allocate.  */
-			tmp = MAX_HEADER + sk->prot->max_header;
-			if (copy < min(mss_now, tp->max_window >> 1) &&
-			    !(flags & MSG_OOB)) {
-				tmp += min(mss_now, tp->max_window);
+			tmp = MAX_TCP_HEADER + 15;
+			if (copy < mss_now && !(flags & MSG_OOB)) {
+				tmp += mss_now;
 
 				/* What is happening here is that we want to
 				 * tack on later members of the users iovec
@@ -901,7 +943,7 @@
 			/* If we didn't get any memory, we need to sleep. */
 			if (skb == NULL) {
 				sk->socket->flags |= SO_NOSPACE;
-				if (flags&MSG_DONTWAIT) {
+				if (!timeo) {
 					err = -EAGAIN;
 					goto do_interrupted;
 				}
@@ -909,8 +951,8 @@
 					err = -ERESTARTSYS;
 					goto do_interrupted;
 				}
-				tcp_push_pending_frames(sk, tp);
-				wait_for_tcp_memory(sk);
+				__tcp_push_pending_frames(sk, tp, mss_now);
+				timeo = wait_for_tcp_memory(sk, timeo);
 
 				/* If SACK's were formed or PMTU events happened,
 				 * we must find out about it.
@@ -923,7 +965,7 @@
 
 			/* Prepare control bits for TCP header creation engine. */
 			TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
-						  ((PSH_NEEDED || psh) ?
+						  ((PSH_NEEDED) ?
 						   TCPCB_FLAG_PSH : 0));
 			TCP_SKB_CB(skb)->sacked = 0;
 			if (flags & MSG_OOB) {
@@ -936,7 +978,7 @@
 			 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 			 * Reserve header space and checksum the data.
 			 */
-			skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
+			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->csum = csum_and_copy_from_user(from,
 					skb_put(skb, copy), copy, 0, &err);
 
@@ -950,7 +992,7 @@
 			TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 
 			/* This advances tp->write_seq for us. */
-			tcp_send_skb(sk, skb, queue_it);
+			tcp_send_skb(sk, skb, queue_it, mss_now);
 		}
 	}
 	sk->err = 0;
@@ -981,63 +1023,39 @@
 do_fault2:
 	err = -EFAULT;
 out:
-	tcp_push_pending_frames(sk, tp);
+	__tcp_push_pending_frames(sk, tp, mss_now);
+	TCP_CHECK_TIMER(sk);
+out_unlock:
+	release_sock(sk);
 	return err;
 }
 
 #undef PSH_NEEDED
 
 /*
- *	Send an ack if one is backlogged at this point. Ought to merge
- *	this with tcp_send_ack().
- *      This is called for delayed acks also.
- */
- 
-void tcp_read_wakeup(struct sock *sk)
-{
-	/* If we're closed, don't send an ack, or we'll get a RST
-	 * from the closed destination.
-	 */
-	if (sk->state != TCP_CLOSE)
-		tcp_send_ack(sk);
-}
-
-/*
  *	Handle reading urgent data. BSD has very simple semantics for
  *	this, no blocking and very strange errors 8)
  */
 
-static int tcp_recv_urg(struct sock * sk, int nonblock,
+static int tcp_recv_urg(struct sock * sk, long timeo,
 			struct msghdr *msg, int len, int flags, 
 			int *addr_len)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 	/* No URG data to read. */
-	if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
+	if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
 		return -EINVAL;	/* Yes this is right ! */
 
 	if (sk->done)
 		return -ENOTCONN;
 
-	if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
-		sk->done = 1;
-		return 0;
-	}
-
-	if (tp->urg_data & URG_VALID) {
+	if (tp->urg_data & TCP_URG_VALID) {
 		int err = 0; 
 		char c = tp->urg_data;
 
 		if (!(flags & MSG_PEEK))
-			tp->urg_data = URG_READ;
-
-		if(msg->msg_name)
-			tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
-						       msg->msg_name);       
-
-		if(addr_len)
-			*addr_len = tp->af_specific->sockaddr_len;
+			tp->urg_data = TCP_URG_READ;
 
 		/* Read urgent data. */
 		msg->msg_flags|=MSG_OOB;
@@ -1051,6 +1069,10 @@
 		return err ? -EFAULT : len;
 	}
 
+	/* Do not set sk->done, it is set only by normal data receive */
+	if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
+		return 0;
+
 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
 	 * the available implementations agree in this case:
 	 * this call should never block, independent of the
@@ -1069,6 +1091,8 @@
 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
 {
 	__skb_unlink(skb, &sk->receive_queue);
+	BUG_TRAP(atomic_read(&skb->users) == 1);
+	/* Well, if I missed something then punishment will be terrible oops. */
 	__kfree_skb(skb);
 }
 
@@ -1080,22 +1104,34 @@
  */
 static void cleanup_rbuf(struct sock *sk, int copied)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff *skb;
+	int time_to_ack;
 	
 	/* NOTE! The socket must be locked, so that we don't get
 	 * a messed-up receive queue.
 	 */
 	while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
-		if (!skb->used || atomic_read(&skb->users) > 1)
+		if (!skb->used)
 			break;
 		tcp_eat_skb(sk, skb);
 	}
 
+	/* Delayed ACKs frequently hit locked sockets during bulk receive. */
+	time_to_ack = tp->ack.blocked && tp->ack.pending;
+#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
+	if (tp->ack.pending &&
+	    (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
+		time_to_ack = 1;
+#endif
+
   	/* We send an ACK if we can now advertise a non-zero window
 	 * which has been raised "significantly".
+	 *
+	 * Even if window raised up to infinity, do not send window open ACK
+	 * in states, where we will not receive more. It is useless.
   	 */
-	if(copied > 0) {
-		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
 		__u32 rcv_window_now = tcp_receive_window(tp);
 		__u32 new_window = __tcp_select_window(sk);
 
@@ -1106,16 +1142,20 @@
 		 * which don't advertize a larger window.
 		 */
 		if((new_window && (new_window >= rcv_window_now * 2)) &&
-		   ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
-			tcp_read_wakeup(sk);
+		   ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp))
+			time_to_ack = 1;
 	}
+	if (time_to_ack)
+		tcp_send_ack(sk);
 }
 
 /* Now socket state including sk->err is changed only under lock,
-   hence we should check only pending signals.
+ * hence we may omit checks after joining wait queue.
+ * We check receive queue before schedule() only as optimization;
+ * it is very likely that release_sock() added new data.
  */
 
-static void tcp_data_wait(struct sock *sk)
+static long tcp_data_wait(struct sock *sk, long timeo)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -1127,17 +1167,39 @@
 	release_sock(sk);
 
 	if (skb_queue_empty(&sk->receive_queue))
-		schedule();
+		timeo = schedule_timeout(timeo);
 
 	lock_sock(sk);
 	sk->socket->flags &= ~SO_WAITDATA;
 
 	remove_wait_queue(sk->sleep, &wait);
 	__set_current_state(TASK_RUNNING);
+	return timeo;
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+	net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
+
+	/* RX process wants to run with disabled BHs, though it is not necessary */
+	local_bh_disable();
+	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+		sk->backlog_rcv(sk, skb);
+	local_bh_enable();
+
+	/* Clear memory counter. */
+	tp->ucopy.memory = 0;
 }
 
 /*
  *	This routine copies from a sock struct into the user buffer. 
+ *
+ *	Technical note: in 2.3 we work on _locked_ socket, so that
+ *	tricks with *seq access order and skb->users are not required.
+ *	Probably, code can be easily improved even more.
  */
  
 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
@@ -1146,13 +1208,18 @@
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int copied = 0;
 	u32 peek_seq;
-	volatile u32 *seq;	/* So gcc doesn't overoptimise */
+	u32 *seq;
 	unsigned long used;
 	int err;
-	int target = 1;		/* Read at least this many bytes */
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	struct task_struct *user_recv = NULL;
 
 	lock_sock(sk);
 
+	TCP_CHECK_TIMER(sk);
+
+
 	if (sk->err)
 		goto out_err;
 
@@ -1160,24 +1227,20 @@
 	if (sk->state == TCP_LISTEN)
 		goto out;
 
+	timeo = sock_rcvtimeo(sk, nonblock);
+
 	/* Urgent data needs to be handled specially. */
 	if (flags & MSG_OOB)
 		goto recv_urg;
 
-	/*	Copying sequence to update. This is volatile to handle
-	 *	the multi-reader case neatly (memcpy_to/fromfs might be
-	 *	inline and thus not flush cached variables otherwise).
-	 */
-	peek_seq = tp->copied_seq;
 	seq = &tp->copied_seq;
-	if (flags & MSG_PEEK)
+	if (flags & MSG_PEEK) {
+		peek_seq = tp->copied_seq;
 		seq = &peek_seq;
+	}
 
-	/* Handle the POSIX bogosity MSG_WAITALL. */
-	if (flags & MSG_WAITALL)
-		target=len;
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
-	
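
The read target now comes from sock_rcvlowat(): an ordinary recv() sleeps until at least SO_RCVLOWAT bytes are queued (or the connection state forces an early return), and MSG_WAITALL simply raises the target to the full request. A sketch of driving that from user space, assuming a connected socket fd and that sock_setsockopt() accepts SO_RCVLOWAT (512 is an arbitrary example):

#include <sys/socket.h>
#include <unistd.h>

ssize_t read_at_least_512(int fd, char *buf, size_t len)
{
	int lowat = 512;

	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	return read(fd, buf, len);	/* wakes with >= 512 bytes, EOF or error */
}
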
 	/*
 	 *	BUG BUG BUG
 	 *	This violates 1003.1g compliance. We must wait for 
@@ -1200,7 +1263,7 @@
 			if (copied)
 				break;
 			copied = -ERESTARTSYS;
-			if (nonblock)
+			if (!timeo)
 				copied = -EAGAIN;
 			break;
 		}
@@ -1232,47 +1295,128 @@
 			skb = skb->next;
 		} while (skb != (struct sk_buff *)&sk->receive_queue);
 
-		if (copied >= target)
+		/* Well, if we have backlog, try to process it now yet. */
+
+		if (copied >= target && sk->backlog.tail == NULL)
 			break;
 
-		if (sk->err && !(flags&MSG_PEEK)) {
-			if (!copied)
+		if (copied) {
+			if (sk->err ||
+			    sk->state == TCP_CLOSE ||
+			    (sk->shutdown & RCV_SHUTDOWN) ||
+			    !timeo)
+				break;
+		} else {
+			if (sk->err) {
 				copied = sock_error(sk);
-			break;
-		}
+				break;
+			}
 
-		if (sk->shutdown & RCV_SHUTDOWN) {
-			sk->done = 1;
-			break;
-		}
+			if (sk->done) {
+				copied = -ENOTCONN;
+				break;
+			}
 
-		if (sk->state == TCP_CLOSE) {
-			if (!sk->done) {
-				sk->done = 1;
+			if (sk->state == TCP_CLOSE) {
+				if (!(flags&MSG_PEEK))
+					sk->done = 1;
 				break;
 			}
-			if (!copied)
-				copied = -ENOTCONN;
-			break;
-		}
 
-		if (nonblock) {
-			copied = -EAGAIN;
-			break;
+			if (sk->shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
 		}
 
 		cleanup_rbuf(sk, copied);
-		tcp_data_wait(sk);
+
+		if (tp->ucopy.task == user_recv) {
+			/* Install new reader */
+			if (user_recv == NULL && !(flags&MSG_PEEK)) {
+				user_recv = current;
+				tp->ucopy.task = user_recv;
+				tp->ucopy.iov = msg->msg_iov;
+			}
+
+			tp->ucopy.len = len;
+
+			BUG_TRAP(tp->copied_seq == tp->rcv_nxt);
+
+			/* Ugly... If prequeue is not empty, we have to
+			 * process it before releasing socket, otherwise
+			 * order will be broken at second iteration.
+			 * More elegant solution is required!!!
+			 *
+			 * Look: we have the following (pseudo)queues:
+			 *
+			 * 1. packets in flight
+			 * 2. backlog
+			 * 3. prequeue
+			 * 4. receive_queue
+			 *
+			 * Each queue can be processed only if the next ones
+			 * are empty. At this point we have empty receive_queue.
+			 * But prequeue _can_ be not empty after second iteration,
+			 * when we jumped to start of loop because backlog
+			 * processing added something to receive_queue.
+			 * We cannot release_sock(), because backlog contains
+			 * packets arrived _after_ prequeued ones.
+			 *
+			 * Shortly, algorithm is clear --- to process all
+			 * the queues in order. We could make it more directly,
+			 * requeueing packets from backlog to prequeue, if
+			 * is not empty. It is more elegant, but eats cycles,
+			 * it is not empty. It is more elegant, but eats cycles,
+			 */
+			if (skb_queue_len(&tp->ucopy.prequeue))
+				goto do_prequeue;
+
+			/* __ Set realtime policy in scheduler __ */
+		}
+
+		if (copied >= target) {
+			/* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else {
+			timeo = tcp_data_wait(sk, timeo);
+		}
+
+		if (user_recv) {
+			int chunk;
+
+			/* __ Restore normal policy in scheduler __ */
+
+			if ((chunk = len - tp->ucopy.len) != 0) {
+				net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
+				len -= chunk;
+				copied += chunk;
+			}
+
+			if (tp->rcv_nxt == tp->copied_seq &&
+			    skb_queue_len(&tp->ucopy.prequeue)) {
+do_prequeue:
+				tcp_prequeue_process(sk);
+
+				if ((chunk = len - tp->ucopy.len) != 0) {
+					net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
+					len -= chunk;
+					copied += chunk;
+				}
+			}
+#if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
+			if (tp->ack.pending &&
+			    (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
+				tcp_send_ack(sk);
+#endif
+		}
 		continue;
 
 	found_ok_skb:
-		/*	Lock the buffer. We can be fairly relaxed as
-		 *	an interrupt will never steal a buffer we are
-		 *	using unless I've missed something serious in
-		 *	tcp_data.
-		 */
-		atomic_inc(&skb->users);
-
 		/* Ok so how much can we use? */
 		used = skb->len - offset;
 		if (len < used)
@@ -1293,36 +1437,28 @@
 			}
 		}
 
-		/*	Copy it - We _MUST_ update *seq first so that we
-		 *	don't ever double read when we have dual readers
-		 */
-		*seq += used;
-
-		/*	This memcpy_toiovec can sleep. If it sleeps and we
-		 *	do a second read it relies on the skb->users to avoid
-		 *	a crash when cleanup_rbuf() gets called.
-		 */
 		err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
 		if (err) {
 			/* Exception. Bailout! */
-			atomic_dec(&skb->users);
-			copied = -EFAULT;
+			if (!copied)
+				copied = -EFAULT;
 			break;
 		}
 
+		*seq += used;
 		copied += used;
 		len -= used;
 
-		/*	We now will not sleep again until we are finished
-		 *	with skb. Sorry if you are doing the SMP port
-		 *	but you'll just have to fix it neatly ;)
-		 *
-		 *	Very funny Alan... -DaveM
-		 */
-		atomic_dec(&skb->users);
-
-		if (after(tp->copied_seq,tp->urg_seq))
+		if (after(tp->copied_seq,tp->urg_seq)) {
 			tp->urg_data = 0;
+			if (skb_queue_len(&tp->out_of_order_queue) == 0
+#ifdef TCP_FORMAL_WINDOW
+			    && tcp_receive_window(tp)
+#endif
+			    ) {
+				tcp_fast_path_on(tp);
+			}
+		}
 		if (used + offset < skb->len)
 			continue;
 
@@ -1334,8 +1470,30 @@
 		if (flags & MSG_PEEK)
 			continue;
 		skb->used = 1;
-		if (atomic_read(&skb->users) == 1)
-			tcp_eat_skb(sk, skb);
+		tcp_eat_skb(sk, skb);
+
+#ifdef CONFIG_TCP_LESS_COARSE_ACKS
+		/* Possible improvement. When sender is faster than receiver,
+		 * traffic looks like: fill window ... wait for window open ...
+		 * fill window. We lose at least one rtt, because call
+		 * fill window. We lose at least one rtt, because we call
+		 * we should insert several intermediate cleanup_rbuf(s).
+		 *
+		 * F.e.:
+		 */
+		do {
+			u32 full_space = min(tp->window_clamp, tcp_full_space(sk));
+
+			/* Try to ACK, if total buffer length is larger
+			   than maximal window and if rcv_window has
+			   chances to increase twice. It will result
+			   to exponentially decreased ACKing during
+			   read to huge (usually, mmapped) buffer.
+			 */
+			if (len >= full_space && tp->rcv_wnd <= full_space/2)
+				cleanup_rbuf(sk, copied);
+		} while (0);
+#endif
 		continue;
 
 	found_fin_ok:
@@ -1345,19 +1503,36 @@
 
 		/* All is done. */
 		skb->used = 1;
-		sk->shutdown |= RCV_SHUTDOWN;
 		break;
 	}
 
-	if (copied >= 0 && msg->msg_name)
-		tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
-					       msg->msg_name);       
+	if (user_recv) {
+		if (skb_queue_len(&tp->ucopy.prequeue)) {
+			int chunk;
+
+			tp->ucopy.len = copied > 0 ? len : 0;
+
+			tcp_prequeue_process(sk);
+
+			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+				net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
+				len -= chunk;
+				copied += chunk;
+			}
+		}
+
+		tp->ucopy.task = NULL;
+		tp->ucopy.len = 0;
+	}
 
-	if(addr_len)
-		*addr_len = tp->af_specific->sockaddr_len;
+	/* According to UNIX98, msg_name/msg_namelen are ignored
+	 * on connected socket. I was just happy when found this 8) --ANK
+	 */
 
 	/* Clean up data we have read: This will do ACK frames. */
 	cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return copied;
 
@@ -1365,24 +1540,16 @@
 	err = sock_error(sk);
 
 out:
+	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return err;
 
 recv_urg:
-	err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
+	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
 	goto out;
 }
 
 /*
- * Check whether to renew the timer.
- */
-static inline void tcp_check_fin_timer(struct sock *sk)
-{
-	if (sk->state == TCP_FIN_WAIT2)
-		tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
-}
-
-/*
  *	State processing on a close. This implements the state shift for
  *	sending our FIN frame. Note that we only send a FIN for some
  *	states. A shutdown() may have already sent the FIN, or we may be
@@ -1405,24 +1572,13 @@
   /* TCP_CLOSING	*/ TCP_CLOSING,
 };
 
-static int tcp_close_state(struct sock *sk, int dead)
+static int tcp_close_state(struct sock *sk)
 {
 	int next = (int) new_state[sk->state];
 	int ns = (next & TCP_STATE_MASK);
 
 	tcp_set_state(sk, ns);
 
-	/*	This is a (useful) BSD violating of the RFC. There is a
-	 *	problem with TCP as specified in that the other end could
-	 *	keep a socket open forever with no application left this end.
-	 *	We use a 3 minute timeout (about the same as BSD) then kill
-	 *	our end. If they send after that then tough - BUT: long enough
-	 *	that we won't make the old 4*rto = almost no time - whoops
-	 *	reset mistake.
-	 */
-	if (dead)
-		tcp_check_fin_timer(sk);
-
 	return (next & TCP_ACTION_FIN);
 }
 
@@ -1443,9 +1599,8 @@
 	/* If we've already sent a FIN, or it's a closed state, skip this. */
 	if ((1 << sk->state) &
 	    (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
-
 		/* Clear out any half completed packets.  FIN if needed. */
-		if (tcp_close_state(sk,0))
+		if (tcp_close_state(sk))
 			tcp_send_fin(sk);
 	}
 }
@@ -1460,40 +1615,6 @@
 	return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
 }
 
-/*
- *	This routine closes sockets which have been at least partially
- *	opened, but not yet accepted. Currently it is only called by
- *	tcp_close.
- */
-
-static void tcp_close_pending (struct sock *sk)
-{
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	struct open_request *req = tp->syn_wait_queue;
-
-	while(req) {
-		struct open_request *iter;
-		
-		if (req->sk)
-			tcp_close(req->sk, 0);
-
-		iter = req;
-		req = req->dl_next;
-
-		if (iter->sk) {
-			sk->ack_backlog--;
-		} else {
-			tcp_dec_slow_timer(TCP_SLT_SYNACK);
-			tp->syn_backlog--;
-		}
-		(*iter->class->destructor)(iter);
-		tcp_openreq_free(iter);
-	}
-	BUG_TRAP(tp->syn_backlog == 0);
-	BUG_TRAP(sk->ack_backlog == 0);
-	tcp_synq_init(tp);
-}
-
 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
 {
 	/* First the read buffer. */
@@ -1528,6 +1649,14 @@
 	/* It it has not 0 sk->num, it must be bound */
 	BUG_TRAP(!sk->num || sk->prev!=NULL);
 
+#ifdef TCP_DEBUG
+	if (sk->zapped) {
+		printk("TCP: double destroy sk=%p\n", sk);
+		sock_hold(sk);
+	}
+	sk->zapped = 1;
+#endif
+
 	sk->prot->destroy(sk);
 
 	tcp_kill_sk_queues(sk);
@@ -1538,6 +1667,7 @@
 	}
 #endif
 
+	atomic_dec(&tcp_orphan_count);
 	sock_put(sk);
 }
 
@@ -1547,17 +1677,17 @@
 	int data_was_unread = 0;
 
 	lock_sock(sk);
+	sk->shutdown = SHUTDOWN_MASK;
+
 	if(sk->state == TCP_LISTEN) {
 		tcp_set_state(sk, TCP_CLOSE);
 
 		/* Special case. */
-		tcp_close_pending(sk);
+		tcp_listen_stop(sk);
 
 		goto adjudge_to_death;
 	}
 
-	sk->shutdown = SHUTDOWN_MASK;
-
 	/*  We need to flush the recv. buffs.  We do this only on the
 	 *  descriptor close, not protocol-sourced closes, because the
 	 *  reader process may not have drained the data yet!
@@ -1581,10 +1711,35 @@
 		/* Unread data was tossed, zap the connection. */
 		tcp_set_state(sk, TCP_CLOSE);
 		tcp_send_active_reset(sk, GFP_KERNEL);
-	} else if (tcp_close_state(sk,1)) {
+	} else if (sk->linger && sk->lingertime==0) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->prot->disconnect(sk, 0);
+	} else if (tcp_close_state(sk)) {
 		/* We FIN if the application ate all the data before
 		 * zapping the connection.
 		 */
+
+		/* RED-PEN. Formally speaking, we have broken TCP state
+		 * machine. State transitions:
+		 *
+		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
+		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+		 *
+		 * are legal only when FIN has been sent (i.e. in window),
+		 * rather than queued out of window. Purists blame.
+		 *
+		 * F.e. "RFC state" is ESTABLISHED,
+		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+		 *
+		 * The visible deviations are that sometimes
+		 * we enter time-wait state, when it is not required really
+		 * (harmless), do not send active resets, when they are
+		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+		 * they look as CLOSING or LAST_ACK for Linux)
+		 * Probably, I missed some more holelets.
+		 * 						--ANK
+		 */
 		tcp_send_fin(sk);
 	}
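
The new zero-linger branch above implements the RFC 793 ABORT call credited in the changelog: with SO_LINGER enabled and l_linger == 0, close() goes through tcp_disconnect(), which sends a RST instead of the normal FIN sequence and releases the socket at once (no TIME-WAIT). From user space that looks like this sketch (fd is assumed to be a connected TCP socket):

#include <sys/socket.h>
#include <unistd.h>

void abortive_close(int fd)
{
	struct linger lin = { .l_onoff = 1, .l_linger = 0 };

	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
	close(fd);	/* RST, no FIN handshake, no TIME-WAIT */
}
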
 
@@ -1594,26 +1749,19 @@
 
 		add_wait_queue(sk->sleep, &wait);
 
-		while (1) {
+		do {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!closing(sk))
 				break;
 			release_sock(sk);
 			timeout = schedule_timeout(timeout);
 			lock_sock(sk);
-			if (!signal_pending(tsk) || timeout)
-				break;
-		}
+		} while (!signal_pending(tsk) && timeout);
 
 		tsk->state = TASK_RUNNING;
 		remove_wait_queue(sk->sleep, &wait);
 	}
 
-	/* Now that the socket is dead, if we are in the FIN_WAIT2 state
-	 * we may need to set up a timer.
-         */
-	tcp_check_fin_timer(sk);
-
 adjudge_to_death:
 	/* It is the last release_sock in its life. It will remove backlog. */
 	release_sock(sk);
@@ -1627,23 +1775,67 @@
 	BUG_TRAP(sk->lock.users==0);
 
 	sock_hold(sk);
+	sock_orphan(sk);
+
+	/*	This is a (useful) BSD violating of the RFC. There is a
+	 *	problem with TCP as specified in that the other end could
+	 *	keep a socket open forever with no application left this end.
+	 *	We use a 3 minute timeout (about the same as BSD) then kill
+	 *	our end. If they send after that then tough - BUT: long enough
+	 *	that we won't make the old 4*rto = almost no time - whoops
+	 *	reset mistake.
+	 *
+	 *	Nope, it was not mistake. It is really desired behaviour
+	 *	f.e. on http servers, when such sockets are useless, but
+	 *	consume significant resources. Let's do it with special
+	 *	linger2	option.					--ANK
+	 */
 
-	/* Announce socket dead, detach it from wait queue and inode. */
-	write_lock_irq(&sk->callback_lock);
-	sk->dead = 1;
-	sk->socket = NULL;
-	sk->sleep = NULL;
-	write_unlock_irq(&sk->callback_lock);
+	if (sk->state == TCP_FIN_WAIT2) {
+		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+		if (tp->linger2 < 0) {
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+		} else {
+			int tmo = tcp_fin_time(tp);
+
+			if (tmo > TCP_TIMEWAIT_LEN) {
+				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+			} else {
+				atomic_inc(&tcp_orphan_count);
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+	}
+	if (sk->state != TCP_CLOSE &&
+	    atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
+		if (net_ratelimit())
+			printk(KERN_INFO "TCP: too many of orphaned sockets\n");
+		tcp_set_state(sk, TCP_CLOSE);
+		tcp_send_active_reset(sk, GFP_ATOMIC);
+	}
+	atomic_inc(&tcp_orphan_count);
 
 	if (sk->state == TCP_CLOSE)
 		tcp_destroy_sock(sk);
 	/* Otherwise, socket is reprieved until protocol close. */
 
+out:
 	bh_unlock_sock(sk);
 	local_bh_enable();
 	sock_put(sk);
 }
 
+/* These states need RST on ABORT according to RFC793 */
+
+extern __inline__ int tcp_need_reset(int state)
+{
+	return ((1 << state) &
+	       	(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
+		 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
+}
+
 int tcp_disconnect(struct sock *sk, int flags)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -1656,9 +1848,14 @@
 
 	/* ABORT function of RFC793 */
 	if (old_state == TCP_LISTEN) {
-		tcp_close_pending(sk);
-	} else if (tcp_connected(old_state)) {
-		tcp_send_active_reset(sk, GFP_KERNEL);
+		tcp_listen_stop(sk);
+	} else if (tcp_need_reset(old_state) ||
+		   (tp->snd_nxt != tp->write_seq &&
+		    (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
+		/* The last check adjusts for the discrepancy of Linux wrt. RFC
+		 * states
+		 */
+		tcp_send_active_reset(sk, gfp_any());
 		sk->err = ECONNRESET;
 	} else if (old_state == TCP_SYN_SENT)
 		sk->err = ECONNRESET;
@@ -1677,26 +1874,25 @@
 	memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
 #endif
 
-	sk->zapped = 0;
 	sk->shutdown = 0;
 	sk->done = 0;
 	sk->write_space = tcp_write_space;
 	tp->srtt = 0;
-#ifdef CONFIG_TCP_TW_RECYCLE
-	if ((tp->write_seq += 2) == 0)
-		tp->write_seq = 1;
-#else
-	tp->write_seq = 0;
-#endif
-	tp->ato = 0;
+	if (sysctl_tcp_tw_recycle) {
+		if ((tp->write_seq += 2) == 0)
+			tp->write_seq = 1;
+	} else {
+		tp->write_seq = 0;
+	}
 	tp->backoff = 0;
 	tp->snd_cwnd = 2;
 	tp->probes_out = 0;
+	tp->packets_out = 0;
 	tp->high_seq = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
 	tp->dup_acks = 0;
-	tp->delayed_acks = 0;
+	tcp_delack_init(tp);
 	tp->send_head = tp->retrans_head = NULL;
 	tp->saw_tstamp = 0;
 	__sk_dst_reset(sk);
@@ -1712,11 +1908,10 @@
  *	conditions. This must be called with the socket locked,
  *	and without the kernel lock held.
  */
-static struct open_request * wait_for_connect(struct sock * sk,
-					      struct open_request **pprev)
+static int wait_for_connect(struct sock * sk, long timeo)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	struct open_request *req;
+	int err;
 
 	/*
 	 * True wake-one mechanism for incoming connections: only
@@ -1736,17 +1931,25 @@
 	for (;;) {
 		current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
 		release_sock(sk);
-		schedule();
+		if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
+			timeo = schedule_timeout(timeo);
 		lock_sock(sk);
-		req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
-		if (req) 
+		err = 0;
+		if (sk->tp_pinfo.af_tcp.accept_queue)
 			break;
+		err = -EINVAL;
+		if (sk->state != TCP_LISTEN)
+			break;
+		err = -ERESTARTSYS;
 		if (signal_pending(current))
 			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
 	}
 	current->state = TASK_RUNNING;
 	remove_wait_queue(sk->sleep, &wait);
-	return req;
+	return err;
 }
 
 /*
@@ -1758,9 +1961,10 @@
 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-	struct open_request *req, *prev;
+	struct open_request *req;
 	struct sock *newsk;
 	int error;
+	long timeo;
 
 	lock_sock(sk); 
 
@@ -1771,25 +1975,27 @@
 	if (sk->state != TCP_LISTEN)
 		goto out;
 
+	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
 	/* Find already established connection */
-	req = tcp_find_established(tp, &prev);
-	if (!req) {
+	if (!tp->accept_queue) {
 		/* If this is a non blocking socket don't sleep */
 		error = -EAGAIN;
-		if (flags & O_NONBLOCK)
+		if (!timeo)
 			goto out;
 
-		error = -ERESTARTSYS;
-		req = wait_for_connect(sk, &prev);
-		if (!req)
+		error = wait_for_connect(sk, timeo);
+		if (error)
 			goto out;
 	}
 
-	tcp_synq_unlink(tp, req, prev);
-	newsk = req->sk;
-	req->class->destructor(req);
-	tcp_openreq_free(req);
-	sk->ack_backlog--; 
+	req = tp->accept_queue;
+	tp->accept_queue = req->dl_next;
+
+ 	newsk = req->sk;
+	tcp_acceptq_removed(sk);
+	tcp_openreq_fastfree(req);
+	BUG_TRAP(newsk->state != TCP_SYN_RECV);
 	release_sock(sk);
 	return newsk;
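
tcp_accept() now pops the head of accept_queue directly and, like the other blocking paths in this patch, obeys a timeout from sock_rcvtimeo(). One consequence is that accept() can be bounded with SO_RCVTIMEO instead of O_NONBLOCK polling, as in this sketch (the 3 s value is an example):

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/time.h>

int accept_with_timeout(int lfd)
{
	struct timeval to = { .tv_sec = 3, .tv_usec = 0 };
	int cfd;

	setsockopt(lfd, SOL_SOCKET, SO_RCVTIMEO, &to, sizeof(to));
	cfd = accept(lfd, NULL, NULL);
	if (cfd < 0 && errno == EAGAIN)
		printf("no connection within 3 seconds\n");
	return cfd;
}
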
 
@@ -1828,7 +2034,7 @@
 		 * the point when this call is done we typically don't yet know
 		 * which interface is going to be used
 		 */
-		if(val < 1 || val > MAX_WINDOW) {
+		if(val < 8 || val > MAX_TCP_WINDOW) {
 			err = -EINVAL;
 			break;
 		}
@@ -1839,11 +2045,11 @@
 		/* You cannot try to use this and TCP_CORK in
 		 * tandem, so let the user know.
 		 */
-		if (sk->nonagle == 2) {
+		if (tp->nonagle == 2) {
 			err = -EINVAL;
 			break;
 		}
-		sk->nonagle = (val == 0) ? 0 : 1;
+		tp->nonagle = (val == 0) ? 0 : 1;
 		break;
 
 	case TCP_CORK:
@@ -1858,14 +2064,14 @@
 		 * You cannot try to use TCP_NODELAY and this mechanism
 		 * at the same time, so let the user know.
 		 */
-		if (sk->nonagle == 1) {
+		if (tp->nonagle == 1) {
 			err = -EINVAL;
 			break;
 		}
 		if (val != 0) {
-			sk->nonagle = 2;
+			tp->nonagle = 2;
 		} else {
-			sk->nonagle = 0;
+			tp->nonagle = 0;
 
 			tcp_push_pending_frames(sk, tp);
 		}
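
TCP_NODELAY and TCP_CORK now live in tp->nonagle (1 and 2 respectively) and stay mutually exclusive; clearing TCP_CORK pushes whatever was queued while corked. The usual corking pattern from user space, sketched (fd, hdr and body are assumptions of the example):

#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_CORK */
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

void send_header_and_body(int fd, const char *hdr, const char *body)
{
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	write(fd, hdr, strlen(hdr));		/* held back ... */
	write(fd, body, strlen(body));		/* ... and coalesced */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));	/* flush */
}
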
@@ -1905,6 +2111,38 @@
 			tp->syn_retries = val;
 		break;
 
+	case TCP_LINGER2:
+		if (val < 0)
+			tp->linger2 = -1;
+		else if (val > sysctl_tcp_fin_timeout/HZ)
+			tp->linger2 = 0;
+		else
+			tp->linger2 = val*HZ;
+		break;
+
+	case TCP_DEFER_ACCEPT:
+		tp->defer_accept = 0;
+		if (val > 0) {
+			/* Translate value in seconds to number of retransmits */
+			while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
+				tp->defer_accept++;
+			tp->defer_accept++;
+		}
+		break;
+
+	case TCP_WINDOW_CLAMP:
+		if (val==0) {
+			if (sk->state != TCP_CLOSE) {
+				err = -EINVAL;
+				break;
+			}
+			tp->window_clamp = 0;
+		} else {
+			tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
+				SOCK_MIN_SNDBUF : val;
+		}
+		break;
+
 	default:
 		err = -ENOPROTOOPT;
 		break;
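
The three new options take plain integers: TCP_LINGER2 bounds how long an orphaned connection may sit in FIN-WAIT-2 (negative makes close() reset instead of lingering), TCP_DEFER_ACCEPT is given in seconds and converted above into a count of SYN-ACK retransmissions to wait for data, and TCP_WINDOW_CLAMP caps the advertised receive window (zero is only accepted while the socket is closed). A sketch of setting them; the numeric fallbacks are the values these options carry in linux/tcp.h (8, 9, 10), included in case the installed libc headers predate them:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_LINGER2
#define TCP_LINGER2		8
#endif
#ifndef TCP_DEFER_ACCEPT
#define TCP_DEFER_ACCEPT	9
#endif
#ifndef TCP_WINDOW_CLAMP
#define TCP_WINDOW_CLAMP	10
#endif

void tune_socket(int fd, int listener)
{
	int fin_wait2_secs = 30;	/* shorten the FIN-WAIT-2 lifetime */
	int defer_secs = 5;		/* ~5 s worth of SYN-ACK retransmits */
	int clamp = 65535;		/* cap the advertised window */

	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_wait2_secs,
		   sizeof(fin_wait2_secs));
	setsockopt(fd, IPPROTO_TCP, TCP_WINDOW_CLAMP, &clamp, sizeof(clamp));
	if (listener)
		setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer_secs,
			   sizeof(defer_secs));
}
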
@@ -1930,37 +2168,38 @@
 
 	switch(optname) {
 	case TCP_MAXSEG:
-		val = tp->user_mss;
+		val = tp->mss_cache;
+		if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
+			val = tp->user_mss;
 		break;
 	case TCP_NODELAY:
-		val = (sk->nonagle == 1);
+		val = (tp->nonagle == 1);
 		break;
 	case TCP_CORK:
-		val = (sk->nonagle == 2);
+		val = (tp->nonagle == 2);
 		break;
 	case TCP_KEEPIDLE:
-		if (tp->keepalive_time)
-			val = tp->keepalive_time / HZ;
-		else
-			val = sysctl_tcp_keepalive_time / HZ;
+		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
 		break;
 	case TCP_KEEPINTVL:
-		if (tp->keepalive_intvl)
-			val = tp->keepalive_intvl / HZ;
-		else
-			val = sysctl_tcp_keepalive_intvl / HZ;
+		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
 		break;
 	case TCP_KEEPCNT:
-		if (tp->keepalive_probes)
-			val = tp->keepalive_probes;
-		else
-			val = sysctl_tcp_keepalive_probes;
+		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
 		break;
 	case TCP_SYNCNT:
-		if (tp->syn_retries)
-			val = tp->syn_retries;
-		else
-			val = sysctl_tcp_syn_retries;
+		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		break;
+	case TCP_LINGER2:
+		val = tp->linger2;
+		if (val > 0)
+			val = (val ? : sysctl_tcp_fin_timeout)/HZ;
+		break;
+	case TCP_DEFER_ACCEPT:
+		val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
+		break;
+	case TCP_WINDOW_CLAMP:
+		val = tp->window_clamp;
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -2049,11 +2288,20 @@
 		tcp_bhash[i].chain = NULL;
 	}
 
+	/* Try to be a bit smarter and adjust defaults depending
+	 * on available memory.
+	 */
 	if (order > 4) {
 		sysctl_local_port_range[0] = 32768;
 		sysctl_local_port_range[1] = 61000;
+		sysctl_tcp_max_tw_buckets = 180000;
+		sysctl_tcp_max_orphans = 4096<<(order-4);
+		sysctl_max_syn_backlog = 1024;
 	} else if (order < 3) {
 		sysctl_local_port_range[0] = 1024*(3-order);
+		sysctl_tcp_max_tw_buckets >>= (3-order);
+		sysctl_tcp_max_orphans >>= (3-order);
+		sysctl_max_syn_backlog = 128;
 	}
 	tcp_port_rover = sysctl_local_port_range[0] - 1;
 
