patch-2.1.8 linux/net/ipv4/tcp.c
- Lines: 1490
- Date: Thu Nov 7 11:02:38 1996
- Orig file: v2.1.7/linux/net/ipv4/tcp.c
- Orig date: Tue Oct 29 19:58:50 1996
diff -u --recursive --new-file v2.1.7/linux/net/ipv4/tcp.c linux/net/ipv4/tcp.c
@@ -172,7 +172,7 @@
* ack if stat is TCP_CLOSED.
* Alan Cox : Look up device on a retransmit - routes may
* change. Doesn't yet cope with MSS shrink right
- * but it's a start!
+ * but its a start!
* Marc Tamsky : Closing in closing fixes.
* Mike Shaver : RFC1122 verifications.
* Alan Cox : rcv_saddr errors.
@@ -194,15 +194,8 @@
* against machines running Solaris,
* and seems to result in general
* improvement.
- * Eric Schenk : Changed receiver side silly window
- * avoidance algorithm to BSD style
- * algorithm. This doubles throughput
- * against machines running Solaris,
- * and seems to result in general
- * improvement.
* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
* Willy Konynenberg : Transparent proxying support.
- * Theodore Ts'o : Do secure TCP sequence numbers.
*
* To Fix:
* Fast path the code. Two things here - fix the window calculation
@@ -425,10 +418,8 @@
* (Whew. -- MS 950903)
**/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
-#include <linux/random.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -438,25 +429,34 @@
unsigned long seq_offset;
struct tcp_mib tcp_statistics;
-static void tcp_close(struct sock *sk, unsigned long timeout);
+
/*
* Find someone to 'accept'. Must be called with
* the socket locked or with interrupts disabled
*/
-static struct sk_buff *tcp_find_established(struct sock *s)
+static struct open_request *tcp_find_established(struct tcp_opt *tp)
{
- struct sk_buff *p=skb_peek(&s->receive_queue);
- if(p==NULL)
+ struct open_request *req;
+
+ req = tp->syn_wait_queue;
+
+ if (!req)
return NULL;
- do
- {
- if(p->sk->state == TCP_ESTABLISHED || p->sk->state >= TCP_FIN_WAIT1)
- return p;
- p=p->next;
- }
- while(p!=(struct sk_buff *)&s->receive_queue);
+
+ do {
+ if (req->sk &&
+ (req->sk->state == TCP_ESTABLISHED ||
+ req->sk->state >= TCP_FIN_WAIT1))
+ {
+ return req;
+ }
+
+ req = req->dl_next;
+
+ } while (req != tp->syn_wait_queue);
+
return NULL;
}
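
The rewritten tcp_find_established() above walks tp->syn_wait_queue as a circular list, so the do/while guard compares against the head rather than testing for NULL. A minimal user-space sketch of that traversal, with stand-in types (struct sock reduced to a state field) so it compiles on its own:

        #include <stdio.h>
        #include <stddef.h>

        /* Stand-ins for the kernel types used in the hunk above. */
        #define TCP_ESTABLISHED 1
        #define TCP_FIN_WAIT1   4

        struct sock { int state; };
        struct open_request {
                struct sock *sk;
                struct open_request *dl_next;   /* circular link */
        };

        static struct open_request *find_established(struct open_request *head)
        {
                struct open_request *req = head;

                if (!req)
                        return NULL;
                do {
                        if (req->sk && (req->sk->state == TCP_ESTABLISHED ||
                                        req->sk->state >= TCP_FIN_WAIT1))
                                return req;
                        req = req->dl_next;
                } while (req != head);          /* full circle: nothing ready */
                return NULL;
        }

        int main(void)
        {
                struct sock syn_recv = { 3 };   /* TCP_SYN_RECV */
                struct sock established = { TCP_ESTABLISHED };
                struct open_request r2 = { &established, NULL };
                struct open_request r1 = { &syn_recv, &r2 };

                r2.dl_next = &r1;               /* close the circle */
                printf("%s\n", find_established(&r1) == &r2 ? "found r2" : "none");
                return 0;
        }
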
@@ -493,108 +493,9 @@
/*
- * This routine is called by the ICMP module when it gets some
- * sort of error condition. If err < 0 then the socket should
- * be closed and the error returned to the user. If err > 0
- * it's just the icmp type << 8 | icmp code. After adjustment
- * header points to the first 8 bytes of the tcp header. We need
- * to find the appropriate port.
- */
-
-void tcp_err(int type, int code, unsigned char *header, __u32 daddr,
- __u32 saddr, struct inet_protocol *protocol)
-{
- struct tcphdr *th = (struct tcphdr *)header;
- struct sock *sk;
-
- /*
- * This one is _WRONG_. FIXME urgently.
- */
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- struct iphdr *iph=(struct iphdr *)(header-sizeof(struct iphdr));
-#endif
- th =(struct tcphdr *)header;
- sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr, 0, 0);
-
- if (sk == NULL)
- return;
-
- if (type == ICMP_SOURCE_QUENCH)
- {
- /*
- * FIXME:
- * Follow BSD for now and just reduce cong_window to 1 again.
- * It is possible that we just want to reduce the
- * window by 1/2, or that we want to reduce ssthresh by 1/2
- * here as well.
- */
- sk->cong_window = 1;
- sk->high_seq = sk->sent_seq;
- return;
- }
-
- if (type == ICMP_PARAMETERPROB)
- {
- sk->err=EPROTO;
- sk->error_report(sk);
- }
-
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- {
- struct rtable * rt;
- /*
- * Ugly trick to pass MTU to protocol layer.
- * Really we should add argument "info" to error handler.
- */
- unsigned short new_mtu = ntohs(iph->id);
-
- if ((rt = sk->ip_route_cache) != NULL)
- if (rt->rt_mtu > new_mtu)
- rt->rt_mtu = new_mtu;
-
- /*
- * FIXME::
- * Not the nicest of fixes: Lose a MTU update if the socket is
- * locked this instant. Not the right answer but will be best
- * for the production fix. Make 2.1 work right!
- */
-
- if (sk->mtu > new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr)
- && new_mtu > sizeof(struct iphdr)+sizeof(struct tcphdr) && !sk->users)
- sk->mtu = new_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
- return;
- }
-#endif
-
- /*
- * If we've already connected we will keep trying
- * until we time out, or the user gives up.
- */
-
- if(code<=NR_ICMP_UNREACH)
- {
- if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- {
- sk->err = icmp_err_convert[code].errno;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
- {
- tcp_statistics.TcpAttemptFails++;
- tcp_set_state(sk,TCP_CLOSE);
- sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
- }
- }
- else /* Only an error on timeout */
- sk->err_soft = icmp_err_convert[code].errno;
- }
-}
-
-
-/*
- * Walk down the receive queue counting readable data until we hit the end or we find a gap
- * in the received data queue (ie a frame missing that needs sending to us). Not
- * sorting using two queues as data arrives makes life so much harder.
+ * Walk down the receive queue counting readable data until we hit the
+ * end or we find a gap in the received data queue (ie a frame missing
+ * that needs sending to us).
*/
static int tcp_readable(struct sock *sk)
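
The tcp_err() deleted in the hunk above carried the ICMP policy: source quench collapsed the congestion window to one segment, and a fragmentation-needed report passed the next-hop MTU in through iph->id (the "ugly trick" the comment apologizes for), shrinking sk->mtu only after subtracting the IP and TCP header sizes. A hedged user-space model of just that clamp arithmetic, assuming 20-byte option-free headers:

        #include <stdio.h>

        #define IP_HDR  20      /* sizeof(struct iphdr), no options */
        #define TCP_HDR 20      /* sizeof(struct tcphdr), no options */

        /*
         * Mirror of the removed ICMP_FRAG_NEEDED guards: shrink only if
         * the reported MTU is smaller than what we use now, and only if
         * it still leaves room for the headers.
         */
        static int clamp_mtu(int cur, int new_mtu)
        {
                if (cur > new_mtu - IP_HDR - TCP_HDR &&
                    new_mtu > IP_HDR + TCP_HDR)
                        return new_mtu - IP_HDR - TCP_HDR;
                return cur;
        }

        int main(void)
        {
                printf("%d\n", clamp_mtu(1460, 576));   /* shrinks to 536 */
                printf("%d\n", clamp_mtu(536, 1500));   /* already smaller: kept */
                printf("%d\n", clamp_mtu(536, 32));     /* bogus report: ignored */
                return 0;
        }
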
@@ -627,13 +528,19 @@
do
{
- if (before(counted, skb->seq)) /* Found a hole so stops here */
+ /* Found a hole so stops here */
+ if (before(counted, skb->seq))
break;
- sum = skb->len - (counted - skb->seq); /* Length - header but start from where we are up to (avoid overlaps) */
+ /*
+ * Length - header but start from where we are up to
+ * avoid overlaps
+ */
+ sum = skb->len - (counted - skb->seq);
if (skb->h.th->syn)
sum++;
if (sum > 0)
- { /* Add it up, move on */
+ {
+ /* Add it up, move on */
amount += sum;
if (skb->h.th->syn)
amount--;
@@ -655,9 +562,13 @@
* and a blocking read(). And the queue scan in tcp_read()
* was correct. Mike <pall@rz.uni-karlsruhe.de>
*/
+
+ /* don't count urg data */
if (skb->h.th->urg)
- amount--; /* don't count urg data */
-/* if (amount && skb->h.th->psh) break;*/
+ amount--;
+#if 0
+ if (amount && skb->h.th->psh) break;
+#endif
skb = skb->next;
}
while(skb != (struct sk_buff *)&sk->receive_queue);
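
tcp_readable()'s counting rules above are easier to see with concrete sequence numbers: advance a cursor through the queue, never count a byte twice when segments overlap, and stop at the first hole. A simplified standalone model (plain < instead of the kernel's wrap-safe before(), and no SYN/FIN/URG adjustments):

        #include <stdio.h>

        struct seg { unsigned int seq; int len; };

        /* Count contiguous readable bytes starting at 'counted'. */
        static int readable(unsigned int counted, const struct seg *q, int n)
        {
                int amount = 0;

                for (int i = 0; i < n; i++) {
                        if (counted < q[i].seq)         /* hole: stop here */
                                break;
                        /* length, but starting from where we are up to */
                        int sum = q[i].len - (int)(counted - q[i].seq);
                        if (sum > 0) {
                                amount += sum;
                                counted += sum;
                        }
                }
                return amount;
        }

        int main(void)
        {
                /* 100..199, an overlap at 150..299, then a hole before 400 */
                struct seg q[] = { {100, 100}, {150, 150}, {400, 100} };

                printf("%d\n", readable(100, q, 3));    /* 200, not 350 */
                return 0;
        }
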
@@ -674,12 +585,12 @@
static int tcp_listen_select(struct sock *sk, int sel_type, select_table *wait)
{
if (sel_type == SEL_IN) {
- struct sk_buff * skb;
+ struct open_request *req;
lock_sock(sk);
- skb = tcp_find_established(sk);
+ req = tcp_find_established(&sk->tp_pinfo.af_tcp);
release_sock(sk);
- if (skb)
+ if (req)
return 1;
select_wait(sk->sleep,wait);
return 0;
@@ -687,7 +598,6 @@
return 0;
}
-
/*
* Wait for a TCP event.
*
@@ -695,8 +605,10 @@
* take care of normal races (between the test and the event) and we don't
* go look at any of the socket buffers directly.
*/
-static int tcp_select(struct sock *sk, int sel_type, select_table *wait)
+int tcp_select(struct sock *sk, int sel_type, select_table *wait)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
if (sk->state == TCP_LISTEN)
return tcp_listen_select(sk, sel_type, wait);
@@ -709,12 +621,12 @@
if (sk->shutdown & RCV_SHUTDOWN)
return 1;
-
- if (sk->acked_seq == sk->copied_seq)
+
+ if (tp->rcv_nxt == sk->copied_seq)
break;
if (sk->urg_seq != sk->copied_seq ||
- sk->acked_seq != sk->copied_seq+1 ||
+ tp->rcv_nxt != sk->copied_seq+1 ||
sk->urginline || !sk->urg_data)
return 1;
break;
@@ -726,7 +638,12 @@
return 0;
if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV)
break;
- if (sk->wmem_alloc*2 > sk->sndbuf)
+ /*
+ * This is now right thanks to a small fix
+ * by Matt Dillon.
+ */
+
+ if (sock_wspace(sk) < sk->mtu+128+sk->prot->max_header)
break;
return 1;
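
The rewritten writability test credited to Matt Dillon above reports a socket writable only while the free send-buffer space can still hold a worst-case packet: one MTU of payload plus protocol headers plus 128 bytes of slack. A two-line model of that threshold (all numbers invented):

        #include <stdio.h>

        /* SEL_OUT: writable iff a full packet still fits in the send buffer. */
        static int tcp_writable(long wspace, int mtu, int max_header)
        {
                return wspace >= mtu + 128 + max_header;
        }

        int main(void)
        {
                printf("%d\n", tcp_writable(4096, 1500, 160));  /* 1: room */
                printf("%d\n", tcp_writable(1024, 1500, 160));  /* 0: would block */
                return 0;
        }
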
@@ -792,50 +709,21 @@
}
-/*
- * This routine computes a TCP checksum.
- *
- * Modified January 1995 from a go-faster DOS routine by
- * Jorge Cwik <jorge@laser.satlink.net>
+/*
+ * This routine builds a generic TCP header.
*/
-#undef DEBUG_TCP_CHECK
-void tcp_send_check(struct tcphdr *th, unsigned long saddr,
- unsigned long daddr, int len, struct sk_buff *skb)
-{
-#ifdef DEBUG_TCP_CHECK
- u16 check;
-#endif
- th->check = 0;
- th->check = tcp_check(th, len, saddr, daddr,
- csum_partial((char *)th,sizeof(*th),skb->csum));
-
-#ifdef DEBUG_TCP_CHECK
- check = th->check;
- th->check = 0;
- th->check = tcp_check(th, len, saddr, daddr,
- csum_partial((char *)th,len,0));
- if (check != th->check) {
- static int count = 0;
- if (++count < 10) {
- printk("Checksum %x (%x) from %p\n", th->check, check,
- (&th)[-1]);
- printk("TCP=<off:%d a:%d s:%d f:%d>\n", th->doff*4, th->ack, th->syn, th->fin);
- }
- }
-#endif
-}
-
-
-/*
- * This routine builds a generic TCP header.
- */
-
-static inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
+
+extern __inline int tcp_build_header(struct tcphdr *th, struct sock *sk, int push)
{
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
memcpy(th,(void *) &(sk->dummy_th), sizeof(*th));
- th->psh = (push == 0) ? 1 : 0;
th->seq = htonl(sk->write_seq);
- th->ack_seq = htonl(sk->acked_seq);
+#if 0
+ th->psh =(push == 0) ? 1 : 0;
+#endif
+ sk->bytes_rcv = 0;
+ sk->ack_timed = 0;
+ th->ack_seq = htonl(tp->rcv_nxt);
th->window = htons(tcp_select_window(sk));
return(sizeof(*th));
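
The deleted tcp_send_check() folded the TCP pseudo-header (source and destination address, protocol 6, segment length) into the checksum before storing th->check. For readers who want to verify a segment by hand, here is the standard RFC 793 one's-complement checksum as portable user-space C; the kernel version delegates the summing to the optimized csum_partial()/tcp_check() helpers instead:

        #include <stdio.h>
        #include <stdint.h>

        /* One's-complement sum over a byte buffer (big-endian pairing). */
        static uint32_t csum_add(uint32_t sum, const uint8_t *p, int len)
        {
                while (len > 1) {
                        sum += (uint32_t)((p[0] << 8) | p[1]);
                        p += 2;
                        len -= 2;
                }
                if (len)
                        sum += (uint32_t)(p[0] << 8);   /* odd trailing byte */
                return sum;
        }

        static uint16_t tcp_checksum(uint32_t saddr, uint32_t daddr,
                                     const uint8_t *seg, uint16_t len)
        {
                uint8_t ph[12];         /* the RFC 793 pseudo-header */
                uint32_t sum;

                ph[0] = saddr >> 24; ph[1] = saddr >> 16;
                ph[2] = saddr >> 8;  ph[3] = saddr & 0xff;
                ph[4] = daddr >> 24; ph[5] = daddr >> 16;
                ph[6] = daddr >> 8;  ph[7] = daddr & 0xff;
                ph[8] = 0;           ph[9] = 6;         /* IPPROTO_TCP */
                ph[10] = len >> 8;   ph[11] = len & 0xff;

                sum = csum_add(csum_add(0, ph, 12), seg, len);
                while (sum >> 16)                       /* fold carries */
                        sum = (sum & 0xffff) + (sum >> 16);
                return (uint16_t)~sum;
        }

        int main(void)
        {
                uint8_t seg[20] = {0};  /* header with check field zeroed */

                printf("0x%04x\n", tcp_checksum(0x0a000001, 0x0a000002,
                                                seg, sizeof seg));
                return 0;
        }
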
@@ -890,187 +778,173 @@
lock_sock(sk);
}
-/*
- * Add more stuff to the end of skb->len
- */
-static int fill_in_partial_skb(struct sock *sk, struct sk_buff *skb,
- unsigned char * from, int seglen)
-{
- void (*send)(struct sock *sk, struct sk_buff *skb);
- int copy, tcp_size;
- tcp_size = skb->tail - (unsigned char *)(skb->h.th + 1);
+static int tcp_append_tail(struct sock *sk, struct sk_buff *skb, u8 *from,
+ int tcp_size, int seglen)
+{
+ int fault;
+ int copy;
- /*
- * Now we may find the frame is as big, or too
- * big for our MSS. Thats all fine. It means the
- * MSS shrank (from an ICMP) after we allocated
- * this frame.
+ /*
+ * Add more stuff to the end
+ * of the skb
*/
- copy = sk->mss - tcp_size;
- if (copy <= 0) {
- tcp_send_skb(sk, skb);
- return 0;
+ copy = min(sk->mss - tcp_size, skb->end - skb->tail);
+ copy = min(copy, seglen);
+
+ tcp_size += copy;
+
+ fault = copy_from_user(skb->tail, from, copy);
+
+ if (fault)
+ {
+ return -1;
}
- /*
- * Otherwise continue to fill the buffer.
- */
- send = tcp_send_skb;
- if (copy > seglen) {
- send = tcp_enqueue_partial;
- copy = seglen;
- }
- copy_from_user(skb->tail, from, copy);
- tcp_size += copy;
- skb->tail += copy;
- skb->len += copy;
+ skb_put(skb, copy);
skb->csum = csum_partial(skb->tail - tcp_size, tcp_size, 0);
+
sk->write_seq += copy;
- if (!sk->packets_out)
- send = tcp_send_skb;
- send(sk, skb);
+ skb->end_seq += copy;
+
return copy;
}
-
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
*/
-static int do_tcp_sendmsg(struct sock *sk,
- int iovlen, struct iovec *iov,
- int len, int nonblock, int flags)
+int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov,
+ int len, int nonblock, int flags)
{
- int copied = 0;
- struct device *dev = NULL;
+ int copied = 0;
+ struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
- /*
+ /*
* Wait for a connection to finish.
*/
while (sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
{
- if (sk->err)
+
+ if (copied)
+ return copied;
+
+ if (sk->err)
return sock_error(sk);
-
+
if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
{
if (sk->keepopen)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
-
+
if (nonblock)
return -EAGAIN;
-
+
if (current->signal & ~current->blocked)
return -ERESTARTSYS;
-
+
wait_for_tcp_connect(sk);
}
-
+
+
/*
* Ok commence sending
*/
-
- while (--iovlen >= 0)
+
+ while(--iovlen >= 0)
{
int seglen=iov->iov_len;
unsigned char * from=iov->iov_base;
+ u32 actual_win;
+
iov++;
- while(seglen > 0)
+ while(seglen > 0)
{
int copy;
int tmp;
struct sk_buff *skb;
- void (*send)(struct sock *, struct sk_buff *);
/*
* Stop on errors
*/
- if (sk->err)
+ if (sk->err)
{
- if (copied)
+ if (copied)
return copied;
return sock_error(sk);
}
/*
- * Make sure that we are established.
+ * Make sure that we are established.
*/
- if (sk->shutdown & SEND_SHUTDOWN)
+ if (sk->shutdown & SEND_SHUTDOWN)
{
if (copied)
return copied;
send_sig(SIGPIPE,current,0);
return -EPIPE;
}
-
- /*
- * The following code can result in copy <= if sk->mss is ever
- * decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
- * sk->mtu is constant once SYN processing is finished. I.e. we
- * had better not get here until we've seen his SYN and at least one
- * valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
- * But ESTABLISHED should guarantee that. sk->max_window is by definition
- * non-decreasing. Note that any ioctl to set user_mss must be done
- * before the exchange of SYN's. If the initial ack from the other
- * end has a window of 0, max_window and thus mss will both be 0.
+
+ /*
+ *Now we need to check if we have a half built packet.
*/
- /*
- * Now we need to check if we have a half built packet.
- */
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- /*
- * Really, we should rebuild all the queues...
- * It's difficult. Temporary hack is to send all
- * queued segments with allowed fragmentation.
- */
+ /* if we have queued packets */
+ if (tp->send_head && !(flags & MSG_OOB) )
{
+ int tcp_size;
+
+ /* Tail */
+
+ skb = sk->write_queue.prev;
+ tcp_size = skb->tail -
+ (unsigned char *)(skb->h.th + 1);
+
/*
- * new_mss may be zero. That indicates
- * we don't have a window estimate for
- * the remote box yet.
- * -- AC
+ * This window_seq test is somewhat dangerous
+ * If the remote does SWS avoidance we should
+ * queue the best we can
+ * if not we should in fact send multiple
+ * packets...
+ * a method for detecting this would be most
+ * welcome
*/
-
- int new_mss = min(sk->mtu, sk->max_window);
- if (new_mss && new_mss < sk->mss)
- {
- tcp_send_partial(sk);
- sk->mss = new_mss;
- }
- }
-#endif
- /*
- * If there is a partly filled frame we can fill
- * out.
- */
- skb = tcp_dequeue_partial(sk);
- if (skb) {
- if (!(flags & MSG_OOB)) {
- int retval;
- retval = fill_in_partial_skb(sk, skb, from, seglen);
- if (retval < 0)
- return retval;
- seglen -= retval;
- from += retval;
- copied += retval;
- len -= retval;
+ if (skb->end > skb->tail &&
+ sk->mss - tcp_size > 0 &&
+ skb->end_seq < tp->snd_una + tp->snd_wnd)
+ {
+
+ copy = tcp_append_tail(sk, skb, from,
+ tcp_size,
+ seglen);
+ if (copy == -1)
+ {
+ return -EFAULT;
+ }
+
+ from += copy;
+ copied += copy;
+ len -= copy;
+ seglen -= copy;
+
+ /*
+ * FIXME: if we're nagling we
+ * should send here.
+ */
continue;
}
- tcp_send_skb(sk, skb);
- continue;
}
+
/*
- * We also need to worry about the window.
- * If window < 1/2 the maximum window we've seen from this
+ * We also need to worry about the window.
+ * If window < 1/2 the maximum window we've seen from this
* host, don't use it. This is sender side
* silly window prevention, as specified in RFC1122.
* (Note that this is different than earlier versions of
@@ -1080,40 +954,51 @@
* be queued for later rather than sent.
*/
- copy = sk->window_seq - sk->write_seq;
- if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
- copy = sk->mss;
- if (copy > seglen)
- copy = seglen;
+ copy = min(seglen, sk->mss);
+
+ actual_win = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
+
+ if (copy > actual_win &&
+ actual_win >= (sk->max_window >> 1))
+ {
+ copy = actual_win;
+ }
+
if (copy <= 0)
{
- printk(KERN_CRIT "TCP: **bug**: copy=%d, sk->mss=%d\n", copy, sk->mss);
- return -EFAULT;
+ printk(KERN_DEBUG "sendmsg: copy < 0\n");
+ return -EIO;
}
/*
- * We should really check the window here also.
+ * If sk->packets_out > 0 segment will be nagled
+ * else we kick it right away
*/
- send = tcp_send_skb;
- tmp = copy + sk->prot->max_header + 15;
- if (copy < sk->mss && !(flags & MSG_OOB) && sk->packets_out)
+ tmp = MAX_HEADER + sk->prot->max_header +
+ sizeof(struct sk_buff) + 15;
+ if (copy < min(sk->mss, sk->max_window >> 1) &&
+ !(flags & MSG_OOB) && sk->packets_out)
+ {
+ tmp += min(sk->mss, sk->max_window);
+ }
+ else
{
- tmp = tmp - copy + sk->mtu + 128;
- send = tcp_enqueue_partial;
+ tmp += copy;
}
+
skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
-
+
/*
- * If we didn't get any memory, we need to sleep.
+ * If we didn't get any memory, we need to sleep.
*/
-
- if (skb == NULL)
+
+ if (skb == NULL)
{
sk->socket->flags |= SO_NOSPACE;
- if (nonblock)
+ if (nonblock)
{
- if (copied)
+ if (copied)
return copied;
return -EAGAIN;
}
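
Sender-side silly window avoidance, as the RFC 1122 comment above lays it out: default to a full-MSS segment; shrink it to the advertised window only while that window is still at least half the largest window the peer has ever offered; and if the window has collapsed below that, build the full segment anyway and let it queue until the window reopens. A compact model of the copy/actual_win computation from the hunk (unsigned sequence arithmetic, values invented):

        #include <stdio.h>

        typedef unsigned int u32;

        /* How large should the next segment be? (sender-side SWS avoidance) */
        static int next_copy(int seglen, int mss, u32 snd_wnd,
                             u32 snd_nxt, u32 snd_una, u32 max_window)
        {
                int copy = seglen < mss ? seglen : mss;
                u32 actual_win = snd_wnd - (snd_nxt - snd_una);

                if ((u32)copy > actual_win && actual_win >= (max_window >> 1))
                        copy = (int)actual_win; /* trim to the usable window */
                return copy;
        }

        int main(void)
        {
                /* 520 bytes usable, >= half of a 1024-byte max: trim to 520 */
                printf("%d\n", next_copy(2000, 536, 1024, 504, 0, 1024));
                /* 392 usable, < half of 8192: keep full MSS; it will queue */
                printf("%d\n", next_copy(2000, 536, 8192, 7800, 0, 8192));
                return 0;
        }
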
@@ -1132,27 +1017,14 @@
skb->sk = sk;
skb->free = 0;
skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
-
+
/*
* FIXME: we need to optimize this.
* Perhaps some hints here would be good.
*/
- tmp = sk->prot->build_header(skb, sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, sk->opt, skb->truesize,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0 )
- {
- sock_wfree(sk, skb);
- if (copied)
- return(copied);
- return(tmp);
- }
-#ifndef CONFIG_NO_PATH_MTU_DISCOVERY
- skb->ip_hdr->frag_off |= htons(IP_DF);
-#endif
- skb->dev = dev;
- skb->h.th =(struct tcphdr *)skb_put(skb,sizeof(struct tcphdr));
- tmp = tcp_build_header(skb->h.th, sk, seglen-copy);
+ tmp = tp->af_specific->build_net_header(sk, skb);
+
if (tmp < 0)
{
sock_wfree(sk, skb);
@@ -1161,6 +1033,20 @@
return(tmp);
}
+ skb->h.th =(struct tcphdr *)
+ skb_put(skb,sizeof(struct tcphdr));
+
+ seglen -= copy;
+ tmp = tcp_build_header(skb->h.th, sk, seglen || iovlen);
+
+ if (tmp < 0)
+ {
+ sock_wfree(sk, skb);
+ if (copied)
+ return(copied);
+ return(tmp);
+ }
+
if (flags & MSG_OOB)
{
skb->h.th->urg = 1;
@@ -1168,92 +1054,42 @@
}
skb->csum = csum_partial_copy_fromuser(from,
- skb->tail, copy, 0);
- skb->tail += copy;
- skb->len += copy;
+ skb_put(skb, copy), copy, 0);
+
from += copy;
copied += copy;
len -= copy;
- seglen -= copy;
- sk->write_seq += copy;
skb->free = 0;
-
- send(sk, skb);
+ sk->write_seq += copy;
+
+ tcp_send_skb(sk, skb);
+
+ release_sock(sk);
+ lock_sock(sk);
}
}
+
sk->err = 0;
return copied;
}
-static int tcp_sendmsg(struct sock *sk, struct msghdr *msg,
- int len, int nonblock, int flags)
-{
- int retval = -EINVAL;
-
- /*
- * Do sanity checking for sendmsg/sendto/send
- */
-
- if (flags & ~(MSG_OOB|MSG_DONTROUTE))
- goto out;
- if (msg->msg_name) {
- struct sockaddr_in *addr=(struct sockaddr_in *)msg->msg_name;
-
- if (msg->msg_namelen < sizeof(*addr))
- goto out;
- if (addr->sin_family && addr->sin_family != AF_INET)
- goto out;
- retval = -ENOTCONN;
- if(sk->state == TCP_CLOSE)
- goto out;
- retval = -EISCONN;
- if (addr->sin_port != sk->dummy_th.dest)
- goto out;
- if (addr->sin_addr.s_addr != sk->daddr)
- goto out;
- }
-
- lock_sock(sk);
- retval = do_tcp_sendmsg(sk, msg->msg_iovlen, msg->msg_iov, len, nonblock, flags);
+
/*
- * Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
- * interactive fast network servers. It's meant to be on and
- * it really improves the throughput though not the echo time
- * on my slow slip link - Alan
- *
- * If not nagling we can send on the before case too..
+ * Send an ack if one is backlogged at this point. Ought to merge
+ * this with tcp_send_ack().
+ * This is called for delayed acks also.
*/
-
- if (sk->partial) {
- if (!sk->packets_out ||
- (sk->nonagle && before(sk->write_seq , sk->window_seq))) {
- tcp_send_partial(sk);
- }
- }
-
- release_sock(sk);
-
-out:
- return retval;
-}
-
-
-/*
- * Send an ack if one is backlogged at this point.
- */
-
+
void tcp_read_wakeup(struct sock *sk)
{
- if (!sk->ack_backlog)
- return;
-
/*
* If we're closed, don't send an ack, or we'll get a RST
* from the closed destination.
*/
+
if ((sk->state == TCP_CLOSE) || (sk->state == TCP_TIME_WAIT))
return;
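
The sendmsg wrapper removed above also carried Nagle's rule in two conditions: flush a partly filled packet immediately only when nothing is in flight, or when TCP_NODELAY (sk->nonagle) is set and the window still has room. A tiny model of that decision, using a wrap-safe compare like the kernel's before():

        #include <stdio.h>

        typedef unsigned int u32;

        /* Wrap-safe sequence compare, same idea as the kernel's before(). */
        static int before(u32 a, u32 b) { return (int)(a - b) < 0; }

        /* Nagle: may the partial packet go out right now? */
        static int can_flush_partial(int packets_out, int nonagle,
                                     u32 write_seq, u32 window_seq)
        {
                return !packets_out ||
                       (nonagle && before(write_seq, window_seq));
        }

        int main(void)
        {
                printf("%d\n", can_flush_partial(0, 0, 100, 200)); /* idle: 1 */
                printf("%d\n", can_flush_partial(3, 0, 100, 200)); /* nagled: 0 */
                printf("%d\n", can_flush_partial(3, 1, 100, 200)); /* NODELAY: 1 */
                return 0;
        }
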
@@ -1267,8 +1103,11 @@
*/
static int tcp_recv_urg(struct sock * sk, int nonblock,
- struct msghdr *msg, int len, int flags, int *addr_len)
+ struct msghdr *msg, int len, int flags,
+ int *addr_len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
/*
* No URG data to read
*/
@@ -1302,13 +1141,12 @@
memcpy_toiovec(msg->msg_iov, &c, 1);
if(msg->msg_name)
{
- struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
- sin->sin_family=AF_INET;
- sin->sin_addr.s_addr=sk->daddr;
- sin->sin_port=sk->dummy_th.dest;
+ tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
+ msg->msg_name);
}
if(addr_len)
- *addr_len=sizeof(struct sockaddr_in);
+ *addr_len= tp->af_specific->sockaddr_len;
+
release_sock(sk);
return 1;
}
@@ -1332,46 +1170,57 @@
static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
{
+ sk->ack_backlog++;
+
skb->sk = sk;
__skb_unlink(skb, &sk->receive_queue);
kfree_skb(skb, FREE_READ);
}
-/*
- * FIXME:
- * This routine frees used buffers.
- * It should consider sending an ACK to let the
- * other end know we now have a bigger window.
- */
static void cleanup_rbuf(struct sock *sk)
{
+ struct sk_buff *skb;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
/*
* NOTE! The socket must be locked, so that we don't get
* a messed-up receive queue.
*/
- while (!skb_queue_empty(&sk->receive_queue)) {
- struct sk_buff *skb = sk->receive_queue.next;
+
+ while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
if (!skb->used || skb->users)
break;
tcp_eat_skb(sk, skb);
}
-
+
+ if(sk->debug)
+ printk("sk->rspace = %lu\n", sock_rspace(sk));
+
/*
- * Tell the world if we raised the window.
+ * We send a ACK if the sender is blocked
+ * else let tcp_data deal with the acking policy.
*/
- if (tcp_raise_window(sk))
- tcp_send_ack(sk);
-}
+
+ if (sock_rspace(sk) > tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) &&
+ (tp->rcv_wnd - (tp->rcv_nxt - tp->rcv_wup) < sk->mss))
+ {
+ /* Send an ack right now. */
+ sk->delayed_acks++;
+ tcp_read_wakeup(sk);
+ }
+
+}
/*
- * This routine copies from a sock struct into the user buffer.
+ * This routine copies from a sock struct into the user buffer.
*/
-
-static int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
- int len, int nonblock, int flags, int *addr_len)
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
+ int len, int nonblock, int flags, int *addr_len)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct wait_queue wait = { current, NULL };
int copied = 0;
u32 peek_seq;
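
cleanup_rbuf()'s new ACK policy above is receiver-side window management: once the space still advertised to the sender has shrunk below one MSS (the sender is about to stall) and the freshly freed receive buffer could advertise more, push out a window-update ACK instead of waiting for the delayed-ACK machinery. A standalone model of that test, with invented numbers:

        #include <stdio.h>

        typedef unsigned int u32;

        /* Should cleanup_rbuf() force a window-update ACK now? */
        static int should_ack_now(u32 rspace, u32 rcv_wnd,
                                  u32 rcv_nxt, u32 rcv_wup, u32 mss)
        {
                /* window the sender still believes it has */
                u32 advertised = rcv_wnd - (rcv_nxt - rcv_wup);

                return rspace > advertised && advertised < mss;
        }

        int main(void)
        {
                /* sender nearly blocked (200 < 536) and 4096 bytes freed */
                printf("%d\n", should_ack_now(4096, 8192, 16000, 8008, 536));
                /* 4200 still advertised: leave it to the normal ack policy */
                printf("%d\n", should_ack_now(4096, 8192, 12000, 8008, 536));
                return 0;
        }
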
@@ -1434,11 +1283,20 @@
current->state = TASK_INTERRUPTIBLE;
- skb = sk->receive_queue.next;
- while (skb != (struct sk_buff *)&sk->receive_queue)
+ skb = skb_peek(&sk->receive_queue);
+ do
{
- if (before(*seq, skb->seq))
+ if (!skb)
break;
+ /*
+ * now that we have two receive queues this
+ * shouldn't happen
+ */
+ if (before(*seq, skb->seq)) {
+ printk("recvmsg bug: copied %X seq %X\n",
+ *seq, skb->seq);
+ break;
+ }
offset = *seq - skb->seq;
if (skb->h.th->syn)
offset--;
@@ -1450,6 +1308,7 @@
skb->used = 1;
skb = skb->next;
}
+ while (skb != (struct sk_buff *)&sk->receive_queue);
if (copied)
break;
@@ -1539,7 +1398,7 @@
*seq += used;
/*
- * This copy_to_user can sleep. If it sleeps and we
+ * This memcpy_tofs can sleep. If it sleeps and we
* do a second read it relies on the skb->users to avoid
* a crash when cleanup_rbuf() gets called.
*/
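
The warning above is a lifetime rule: the copy to user space can sleep, and cleanup_rbuf() may run meanwhile, so an skb being copied out must be pinned through skb->users or it could be freed underneath the reader. A minimal single-threaded model of the pin/unpin discipline that makes the rule concrete:

        #include <stdio.h>

        struct buf { int users; int used; };

        /* cleanup_rbuf()'s rule: free only fully consumed, unpinned buffers. */
        static int can_free(const struct buf *b)
        {
                return b->used && !b->users;
        }

        int main(void)
        {
                struct buf b = { 0, 1 };        /* consumed, not pinned */

                b.users++;                      /* pin before the sleeping copy */
                printf("%d\n", can_free(&b));   /* 0: copy in progress */
                b.users--;                      /* copy finished */
                printf("%d\n", can_free(&b));   /* 1: safe to reap */
                return 0;
        }
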
@@ -1590,15 +1449,13 @@
}
- if(copied>0 && msg->msg_name)
+ if(copied > 0 && msg->msg_name)
{
- struct sockaddr_in *sin=(struct sockaddr_in *)msg->msg_name;
- sin->sin_family=AF_INET;
- sin->sin_addr.s_addr=sk->daddr;
- sin->sin_port=sk->dummy_th.dest;
+ tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
+ msg->msg_name);
}
if(addr_len)
- *addr_len=sizeof(struct sockaddr_in);
+ *addr_len= tp->af_specific->sockaddr_len;
remove_wait_queue(sk->sleep, &wait);
current->state = TASK_RUNNING;
@@ -1708,19 +1565,16 @@
sk->shutdown |= SEND_SHUTDOWN;
/*
- * Clear out any half completed packets.
+ * Clear out any half completed packets.
*/
- if (sk->partial)
- tcp_send_partial(sk);
-
/*
* FIN if needed
*/
-
+
if (tcp_close_state(sk,0))
tcp_send_fin(sk);
-
+
release_sock(sk);
}
@@ -1741,7 +1595,7 @@
}
-static void tcp_close(struct sock *sk, unsigned long timeout)
+void tcp_close(struct sock *sk, unsigned long timeout)
{
struct sk_buff *skb;
@@ -1774,22 +1628,16 @@
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
-
+
while((skb=skb_dequeue(&sk->receive_queue))!=NULL)
kfree_skb(skb, FREE_READ);
- /*
- * Get rid off any half-completed packets.
- */
-
- if (sk->partial)
- tcp_send_partial(sk);
-
+
/*
* Timeout is not the same thing - however the code likes
* to send both the same way (sigh).
*/
-
+
if (tcp_close_state(sk,1)==1)
{
tcp_send_fin(sk);
@@ -1836,14 +1684,13 @@
/*
- * Wait for an incoming connection, avoid race
- * conditions. This must be called with the socket
- * locked.
+ * Wait for an incoming connection, avoid race
+ * conditions. This must be called with the socket locked.
*/
-static struct sk_buff * wait_for_connect(struct sock * sk)
+static struct open_request * wait_for_connect(struct sock * sk)
{
struct wait_queue wait = { current, NULL };
- struct sk_buff * skb = NULL;
+ struct open_request *req = NULL;
add_wait_queue(sk->sleep, &wait);
for (;;) {
@@ -1851,27 +1698,29 @@
release_sock(sk);
schedule();
lock_sock(sk);
- skb = tcp_find_established(sk);
- if (skb)
+ req = tcp_find_established(&(sk->tp_pinfo.af_tcp));
+ if (req)
break;
if (current->signal & ~current->blocked)
break;
}
remove_wait_queue(sk->sleep, &wait);
- return skb;
+ return req;
}
+
/*
* This will accept the next outstanding connection.
*
* Be careful about race conditions here - this is subtle.
*/
-static struct sock *tcp_accept(struct sock *sk, int flags)
+struct sock *tcp_accept(struct sock *sk, int flags)
{
- int error;
- struct sk_buff *skb;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ struct open_request *req;
struct sock *newsk = NULL;
+ int error;
/*
* We need to make sure that this socket is listening,
@@ -1884,12 +1733,12 @@
lock_sock(sk);
- skb = tcp_find_established(sk);
- if (skb) {
+ req = tcp_find_established(tp);
+ if (req) {
got_new_connect:
- __skb_unlink(skb, &sk->receive_queue);
- newsk = skb->sk;
- kfree_skb(skb, FREE_READ);
+ tcp_synq_unlink(tp, req);
+ newsk = req->sk;
+ kfree(req);
sk->ack_backlog--;
error = 0;
out:
@@ -1902,238 +1751,35 @@
error = EAGAIN;
if (flags & O_NONBLOCK)
goto out;
- skb = wait_for_connect(sk);
- if (skb)
+ req = wait_for_connect(sk);
+ if (req)
goto got_new_connect;
error = ERESTARTSYS;
goto out;
}
-/*
- * Check that a TCP address is unique, don't allow multiple
- * connects to/from the same address
- */
-static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
-{
- int retval = 1;
- struct sock * sk;
-
- /* Make sure we are allowed to connect here. */
- cli();
- for (sk = tcp_prot.sock_array[snum & (SOCK_ARRAY_SIZE -1)];
- sk != NULL; sk = sk->next)
- {
- /* hash collision? */
- if (sk->num != snum)
- continue;
- if (sk->saddr != saddr)
- continue;
- if (sk->daddr != daddr)
- continue;
- if (sk->dummy_th.dest != dnum)
- continue;
- retval = 0;
- break;
- }
- sti();
- return retval;
-}
-
/*
- * This will initiate an outgoing connection.
+ * Socket option code for TCP.
*/
-
-static int tcp_connect(struct sock *sk, struct sockaddr_in *usin, int addr_len)
+
+int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
+ int optlen)
{
- struct sk_buff *buff;
- struct device *dev=NULL;
- unsigned char *ptr;
- int tmp;
- int atype;
- struct tcphdr *t1;
- struct rtable *rt;
-
- if (sk->state != TCP_CLOSE)
- return(-EISCONN);
-
- /*
- * Don't allow a double connect.
- */
-
- if(sk->daddr)
- return -EINVAL;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int val;
- if (addr_len < 8)
- return(-EINVAL);
-
- if (usin->sin_family && usin->sin_family != AF_INET)
- return(-EAFNOSUPPORT);
-
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
-
- if (usin->sin_addr.s_addr==INADDR_ANY)
- usin->sin_addr.s_addr=ip_my_addr();
-
- /*
- * Don't want a TCP connection going to a broadcast address
- */
-
- if ((atype=ip_chk_addr(usin->sin_addr.s_addr)) == IS_BROADCAST || atype==IS_MULTICAST)
- return -ENETUNREACH;
-
- if (!tcp_unique_address(sk->saddr, sk->num, usin->sin_addr.s_addr, usin->sin_port))
- return -EADDRNOTAVAIL;
-
- lock_sock(sk);
- sk->daddr = usin->sin_addr.s_addr;
-
- sk->rcv_ack_cnt = 1;
- sk->err = 0;
- sk->dummy_th.dest = usin->sin_port;
-
- buff = sock_wmalloc(sk,MAX_SYN_SIZE,0, GFP_KERNEL);
- if (buff == NULL)
+ if (level != SOL_TCP)
{
- release_sock(sk);
- return(-ENOMEM);
+ return tp->af_specific->setsockopt(sk, level, optname,
+ optval, optlen);
}
- buff->sk = sk;
- buff->free = 0;
- buff->localroute = sk->localroute;
-
- /*
- * Put in the IP header and routing stuff.
- */
-
- tmp = sk->prot->build_header(buff, sk->saddr, sk->daddr, &dev,
- IPPROTO_TCP, sk->opt, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl,&sk->ip_route_cache);
- if (tmp < 0)
- {
- sock_wfree(sk, buff);
- release_sock(sk);
- return(-ENETUNREACH);
- }
- if ((rt = sk->ip_route_cache) != NULL && !sk->saddr)
- sk->saddr = rt->rt_src;
- sk->rcv_saddr = sk->saddr;
-
- /*
- * Set up our outgoing TCP sequence number
- */
- sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
- sk->dummy_th.source,
- usin->sin_port);
- sk->window_seq = sk->write_seq;
- sk->rcv_ack_seq = sk->write_seq -1;
-
- t1 = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
-
- memcpy(t1,(void *)&(sk->dummy_th), sizeof(*t1));
- buff->seq = sk->write_seq++;
- t1->seq = htonl(buff->seq);
- sk->sent_seq = sk->write_seq;
- buff->end_seq = sk->write_seq;
- t1->ack = 0;
- t1->window = 2;
- t1->syn = 1;
- t1->doff = 6;
- /* use 512 or whatever user asked for */
-
- if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
- sk->window_clamp=rt->rt_window;
- else
- sk->window_clamp=0;
-
- if (sk->user_mss)
- sk->mtu = sk->user_mss;
- else if (rt)
- sk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
- else
- sk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
- /*
- * but not bigger than device MTU
- */
-
- if(sk->mtu <32)
- sk->mtu = 32; /* Sanity limit */
-
- sk->mtu = min(sk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
-
-#ifdef CONFIG_SKIP
-
- /*
- * SKIP devices set their MTU to 65535. This is so they can take packets
- * unfragmented to security process then fragment. They could lie to the
- * TCP layer about a suitable MTU, but it's easier to let skip sort it out
- * simply because the final package we want unfragmented is going to be
- *
- * [IPHDR][IPSP][Security data][Modified TCP data][Security data]
- */
-
- if(skip_pick_mtu!=NULL) /* If SKIP is loaded.. */
- sk->mtu=skip_pick_mtu(sk->mtu,dev);
-#endif
-
- /*
- * Put in the TCP options to say MTU.
- */
-
- ptr = skb_put(buff,4);
- ptr[0] = 2;
- ptr[1] = 4;
- ptr[2] = (sk->mtu) >> 8;
- ptr[3] = (sk->mtu) & 0xff;
- buff->csum = csum_partial(ptr, 4, 0);
- tcp_send_check(t1, sk->saddr, sk->daddr,
- sizeof(struct tcphdr) + 4, buff);
-
- /*
- * This must go first otherwise a really quick response will get reset.
- */
-
- tcp_cache_zap();
- tcp_set_state(sk,TCP_SYN_SENT);
- if(rt&&rt->rt_flags&RTF_IRTT)
- sk->rto = rt->rt_irtt;
- else
- sk->rto = TCP_TIMEOUT_INIT;
- sk->delack_timer.function = tcp_delack_timer;
- sk->delack_timer.data = (unsigned long) sk;
- sk->retransmit_timer.function = tcp_retransmit_timer;
- sk->retransmit_timer.data = (unsigned long)sk;
- sk->retransmits = 0;
- sk->prot->queue_xmit(sk, dev, buff, 0);
- tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
- tcp_statistics.TcpActiveOpens++;
- tcp_statistics.TcpOutSegs++;
-
- release_sock(sk);
- return(0);
-}
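
The tcp_connect() removed here did two things worth keeping in mind now that the connect path has moved out of this file: it chose the MSS (user-set value, else route MTU, else the classic 576-byte default, floored at 32 and capped by the device MTU) and hand-encoded the SYN's MSS option as kind 2, length 4, value in network byte order. A self-contained sketch of both steps; header sizes and sample MTUs are assumptions:

        #include <stdio.h>
        #include <stdint.h>

        #define IP_HDR  20
        #define TCP_HDR 20

        /* MSS selection, in the order the removed tcp_connect() used. */
        static int pick_mss(int user_mss, int route_mtu, int dev_mtu)
        {
                int mss, dev_limit = dev_mtu - IP_HDR - TCP_HDR;

                if (user_mss)
                        mss = user_mss;
                else if (route_mtu)
                        mss = route_mtu - IP_HDR - TCP_HDR;
                else
                        mss = 576 - IP_HDR - TCP_HDR;   /* classic 536 */

                if (mss < 32)
                        mss = 32;                       /* sanity limit */
                return mss < dev_limit ? mss : dev_limit;
        }

        /* The four option bytes the removed code wrote into the SYN. */
        static void mss_option(uint8_t opt[4], int mss)
        {
                opt[0] = 2;                             /* kind: MSS */
                opt[1] = 4;                             /* option length */
                opt[2] = (uint8_t)(mss >> 8);
                opt[3] = (uint8_t)(mss & 0xff);
        }

        int main(void)
        {
                uint8_t opt[4];
                int mss = pick_mss(0, 0, 1500);         /* no user/route hints */

                mss_option(opt, mss);
                printf("mss=%d opt=%02x %02x %02x %02x\n",
                       mss, opt[0], opt[1], opt[2], opt[3]);
                return 0;
        }
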
-
-/*
- * Socket option code for TCP.
- */
-
-int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen)
-{
- int val,err;
-
- if(level!=SOL_TCP)
- return ip_setsockopt(sk,level,optname,optval,optlen);
if (optval == NULL)
return(-EINVAL);
- err=verify_area(VERIFY_READ, optval, sizeof(int));
- if(err)
- return err;
-
- get_user(val, (int *)optval);
+ if (get_user(val, (int *)optval))
+ return -EFAULT;
switch(optname)
{
@@ -2155,12 +1801,17 @@
}
}
-int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval, int *optlen)
+int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
+ int *optlen)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
int val,err;
- if(level!=SOL_TCP)
- return ip_getsockopt(sk,level,optname,optval,optlen);
+ if(level != SOL_TCP)
+ {
+ return tp->af_specific->getsockopt(sk, level, optname,
+ optval, optlen);
+ }
switch(optname)
{
@@ -2186,29 +1837,21 @@
return(0);
}
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+ if (!sk->keepopen && val)
+ {
+ tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
+ }
+ else if (sk->keepopen && !val)
+ {
+ tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
+ }
+}
-struct proto tcp_prot = {
- tcp_close,
- ip_build_header,
- tcp_connect,
- tcp_accept,
- ip_queue_xmit,
- tcp_retransmit,
- tcp_write_wakeup,
- tcp_read_wakeup,
- tcp_rcv,
- tcp_select,
- tcp_ioctl,
- NULL,
- tcp_shutdown,
- tcp_setsockopt,
- tcp_getsockopt,
- tcp_sendmsg,
- tcp_recvmsg,
- NULL, /* No special bind() */
- 128,
- 0,
- "TCP",
- 0, 0,
- {NULL,}
-};
+/*
+ * Local variables:
+ * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp.o tcp.c"
+ * c-file-style: "Linux"
+ * End:
+ */