patch-2.1.53 linux/net/ipv4/tcp_ipv4.c
- Lines: 650
- Date: Thu Sep 4 15:42:20 1997
- Orig file: v2.1.52/linux/net/ipv4/tcp_ipv4.c
- Orig date: Thu Aug 14 20:49:18 1997
diff -u --recursive --new-file v2.1.52/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.52 1997/07/23 15:19:10 freitag Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.61 1997/09/02 09:46:55 freitag Exp $
*
* IPv4 specific functions
*
@@ -33,6 +33,13 @@
* Andi Kleen : Add support for syncookies and fixed
* some bugs: ip options weren't passed to
* the TCP layer, missed a check for an ACK bit.
+ * Andi Kleen : Implemented fast path mtu discovery.
+ * Fixed many serious bugs in the
+ * open_request handling and moved
+ * most of it into the af independent code.
+ * Added tail drop and some other bugfixes.
+ * Added new listen semantics (ifdefed by
+ * NEW_LISTEN for now)
*/
#include <linux/config.h>
@@ -53,6 +60,9 @@
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_syncookies;
+/* Define this to check TCP sequence numbers in ICMP packets. */
+#define ICMP_PARANOIA 1
+
static void tcp_v4_send_reset(struct sk_buff *skb);
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
@@ -517,9 +527,11 @@
}
if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
- usin->sin_port))
+ usin->sin_port)) {
+ ip_rt_put(rt);
return -EADDRNOTAVAIL;
-
+ }
+
lock_sock(sk);
sk->dst_cache = &rt->u.dst;
sk->daddr = rt->rt_dst;
@@ -673,6 +685,76 @@
return retval;
}
+
+/*
+ * Do a linear search in the socket open_request list.
+ * This should be replaced with a global hash table.
+ */
+static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
+ void *header,
+ struct tcphdr *th,
+ struct open_request **prevp)
+{
+ struct iphdr *iph = header;
+ struct open_request *req, *prev;
+ __u16 rport = th->source;
+
+ /* Assumption: the socket is not in use,
+ * as we checked the user count in tcp_rcv and we're
+ * running from a soft interrupt.
+ */
+ prev = (struct open_request *) (&tp->syn_wait_queue);
+ for (req = prev->dl_next; req; req = req->dl_next) {
+ if (req->af.v4_req.rmt_addr == iph->saddr &&
+ req->af.v4_req.loc_addr == iph->daddr &&
+ req->rmt_port == rport) {
+ *prevp = prev;
+ return req;
+ }
+ prev = req;
+ }
+ return NULL;
+}
+
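tcp_v4_search_req() above walks the per-socket syn_wait_queue linearly on every lookup, and its comment notes that a global hash table should eventually replace it. Purely as an illustration of that idea (the names and types below are invented for this sketch and are not part of the patch), a lookup keyed on the connection addresses and peer port could be structured like this:

    /* Hypothetical sketch of a hashed open_request lookup; compiles as
     * ordinary userspace C.  A global table could key on
     * (saddr, daddr, rport) with per-bucket chains instead of one
     * list per listening socket.
     */
    #include <stdint.h>
    #include <stddef.h>

    #define SYNQ_HASH_SIZE 512

    struct open_req {
            uint32_t loc_addr, rmt_addr;    /* IPv4 addresses, network order */
            uint16_t rmt_port;              /* peer port, network order */
            struct open_req *hash_next;     /* chain within one bucket */
    };

    static struct open_req *synq_hash[SYNQ_HASH_SIZE];

    static unsigned int synq_hashfn(uint32_t saddr, uint32_t daddr, uint16_t rport)
    {
            /* cheap mix; a real implementation would want something stronger */
            return (saddr ^ daddr ^ rport) % SYNQ_HASH_SIZE;
    }

    static struct open_req *synq_lookup(uint32_t saddr, uint32_t daddr, uint16_t rport)
    {
            struct open_req *req;

            for (req = synq_hash[synq_hashfn(saddr, daddr, rport)];
                 req != NULL; req = req->hash_next) {
                    if (req->rmt_addr == saddr &&
                        req->loc_addr == daddr &&
                        req->rmt_port == rport)
                            return req;
            }
            return NULL;
    }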
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void do_pmtu_discovery(struct sock *sk,
+ struct iphdr *ip,
+ struct tcphdr *th)
+{
+ int new_mtu;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ /* Not interested in TCP_LISTEN and open_requests (SYN-ACKs
+ * sent out by Linux are always <576 bytes so they should go through
+ * unfragmented).
+ */
+ if (sk->state == TCP_LISTEN)
+ return;
+
+ /* We don't check in the dst entry if pmtu discovery is forbidden
+ * on this route. We just assume that no packet-too-big packets
+ * are sent back when pmtu discovery is not active.
+ * There is a small race when the user changes this flag in the
+ * route, but I think that's acceptable.
+ */
+ if (sk->ip_pmtudisc != IP_PMTUDISC_DONT && sk->dst_cache) {
+ new_mtu = sk->dst_cache->pmtu -
+ (ip->ihl<<2) - tp->tcp_header_len;
+ if (new_mtu < sk->mss && new_mtu > 0) {
+ sk->mss = new_mtu;
+ /* Resend the TCP packet because it's
+ * clear that the old packet has been
+ * dropped. This is the new "fast" path mtu
+ * discovery.
+ */
+ tcp_simple_retransmit(sk);
+ }
+ }
+}
+
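The recomputed MSS above is simply the reported path MTU minus the IP header length taken from the quoted packet and the full TCP header length (including options) this connection emits. A stand-alone illustration of that arithmetic, using example values that are not taken from the patch:

    #include <stdio.h>

    /* Mirror of the new_mtu computation in do_pmtu_discovery(), for
     * illustration only; the figures below are assumptions, not data
     * from the patch.
     */
    static int shrink_mss(int old_mss, int path_mtu, int ip_hdr_len, int tcp_hdr_len)
    {
            int new_mtu = path_mtu - ip_hdr_len - tcp_hdr_len;

            if (new_mtu > 0 && new_mtu < old_mss)
                    return new_mtu;         /* shrink and retransmit */
            return old_mss;                 /* stale or bogus report: ignore */
    }

    int main(void)
    {
            /* a router reports a 576-byte path MTU; 20-byte IP header (ihl = 5),
             * 20-byte TCP header without options: 576 - 20 - 20 = 536
             */
            printf("%d\n", shrink_mss(1460, 576, 20, 20));  /* 536 */
            /* same report with RFC1323 timestamps (32-byte TCP header) */
            printf("%d\n", shrink_mss(1460, 576, 20, 32));  /* 524 */
            return 0;
    }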
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -685,61 +767,125 @@
void tcp_v4_err(struct sk_buff *skb, unsigned char *dp)
{
struct iphdr *iph = (struct iphdr*)dp;
- struct tcphdr *th = (struct tcphdr*)(dp+(iph->ihl<<2));
+ struct tcphdr *th;
struct tcp_opt *tp;
int type = skb->h.icmph->type;
int code = skb->h.icmph->code;
struct sock *sk;
+ __u32 seq;
- sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source);
-
- if (sk == NULL)
+#if 0
+ /* check wrong - icmp.c should pass in len */
+ if (skb->len < 8+(iph->ihl << 2)+sizeof(struct tcphdr)) {
+ icmp_statistics.IcmpInErrors++;
return;
+ }
+#endif
+
+ th = (struct tcphdr*)(dp+(iph->ihl<<2));
+
+ sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source);
+ if (sk == NULL) {
+ icmp_statistics.IcmpInErrors++;
+ return;
+ }
+ /* Pointless, because we have no way to retry when sk is locked.
+ But the socket should really be locked here for better interaction
+ with the socket layer. This needs to be solved for SMP
+ (I would prefer an "ICMP backlog"). */
+ /* lock_sock(sk); */
tp = &sk->tp_pinfo.af_tcp;
- if (type == ICMP_SOURCE_QUENCH) {
+
+ seq = ntohl(th->seq);
+
+#ifdef ICMP_PARANOIA
+ if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp packet outside the tcp window:"
+ " s:%d %u,%u,%u\n",
+ (int)sk->state, seq, tp->snd_una, tp->snd_nxt);
+ goto out;
+ }
+#endif
+
+ switch (type) {
+ case ICMP_SOURCE_QUENCH:
tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
tp->snd_cwnd = tp->snd_ssthresh;
tp->high_seq = tp->snd_nxt;
- return;
- }
-
- if (type == ICMP_PARAMETERPROB) {
+ goto out;
+ case ICMP_PARAMETERPROB:
sk->err=EPROTO;
sk->error_report(sk);
- }
-
- /* FIXME: What about the IP layer options size here? */
- /* FIXME: add a timeout here, to cope with broken devices that
- drop all DF=1 packets. Do some more sanity checking
- here to prevent DOS attacks?
- This code should kick the tcp_output routine to
- retransmit a packet immediately because we know that
- the last packet has been dropped. -AK */
- if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- if (sk->ip_pmtudisc != IP_PMTUDISC_DONT) {
- int new_mtu = sk->dst_cache->pmtu - sizeof(struct iphdr) - tp->tcp_header_len;
- if (new_mtu < sk->mss && new_mtu > 0) {
- sk->mss = new_mtu;
- }
+ break;
+ case ICMP_DEST_UNREACH:
+ if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+ do_pmtu_discovery(sk, iph, th);
+ goto out;
}
- return;
+ break;
}
/* If we've already connected we will keep trying
* until we time out, or the user gives up.
*/
- if (code <= NR_ICMP_UNREACH) {
- if(icmp_err_convert[code].fatal || sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ if (code <= NR_ICMP_UNREACH) {
+ int fatal = 0;
+
+ if (sk->state == TCP_LISTEN) {
+ struct open_request *req, *prev;
+
+ /* Prevent race conditions with accept() -
+ * icmp is unreliable.
+ * This is the easiest solution for now - for
+ * very big servers it might prove inadequate.
+ */
+ if (sk->sock_readers) {
+ /* XXX: add a counter here to profile this.
+ * If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ goto out;
+ }
+
+ req = tcp_v4_search_req(tp, iph, th, &prev);
+ if (!req)
+ goto out;
+#ifdef ICMP_PARANOIA
+ if (seq != req->snt_isn) {
+ if (net_ratelimit())
+ printk(KERN_DEBUG "icmp packet for openreq "
+ "with wrong seq number:%d:%d\n",
+ seq, req->snt_isn);
+ goto out;
+ }
+#endif
+ if (req->sk) { /* not yet accept()ed */
+ sk = req->sk;
+ } else {
+ tcp_synq_unlink(tp, req, prev);
+ tcp_openreq_free(req);
+ fatal = 1;
+ }
+ } else if (sk->state == TCP_SYN_SENT
+ || sk->state == TCP_SYN_RECV)
+ fatal = 1;
+
+ if(icmp_err_convert[code].fatal || fatal) {
sk->err = icmp_err_convert[code].errno;
- if (sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) {
+ if (fatal) {
tcp_statistics.TcpAttemptFails++;
- tcp_set_state(sk,TCP_CLOSE);
+ if (sk->state != TCP_LISTEN)
+ tcp_set_state(sk,TCP_CLOSE);
sk->error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
}
} else /* Only an error on timeout */
sk->err_soft = icmp_err_convert[code].errno;
}
+
+out:
+ /* release_sock(sk); */
}
/* This routine computes an IPv4 TCP checksum. */
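The ICMP_PARANOIA test earlier in tcp_v4_err() relies on the fact that an ICMP error quotes the header of the datagram that provoked it, so the quoted sequence number must lie between snd_una and snd_nxt, the range of data sent but not yet acknowledged; anything else is logged and dropped as stale or forged. A minimal stand-alone version of that check (the helper below reproduces the usual wraparound-safe comparison; the sample numbers are made up):

    #include <stdint.h>
    #include <stdio.h>

    /* Is seq within [low, high] modulo 2^32?  Unsigned subtraction keeps
     * the comparison correct across sequence-number wraparound.
     */
    static int seq_between(uint32_t seq, uint32_t low, uint32_t high)
    {
            return high - low >= seq - low;
    }

    int main(void)
    {
            uint32_t snd_una = 1000, snd_nxt = 1500;    /* example send window */

            printf("%d\n", seq_between(1200, snd_una, snd_nxt)); /* 1: plausible, process it */
            printf("%d\n", seq_between(4000, snd_una, snd_nxt)); /* 0: outside window, drop */
            return 0;
    }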
@@ -872,16 +1018,18 @@
th->dest = req->rmt_port;
skb->seq = req->snt_isn;
skb->end_seq = skb->seq + 1;
- th->seq = ntohl(skb->seq);
+ th->seq = htonl(skb->seq);
th->ack_seq = htonl(req->rcv_isn + 1);
- if (req->rcv_wnd == 0) {
+ if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+ __u8 rcv_wscale;
/* Set this up on the first call only */
req->window_clamp = skb->dst->window;
tcp_select_initial_window(sock_rspace(sk)/2,req->mss,
&req->rcv_wnd,
&req->window_clamp,
req->wscale_ok,
- &req->rcv_wscale);
+ &rcv_wscale);
+ req->rcv_wscale = rcv_wscale;
}
th->window = htons(req->rcv_wnd);
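The ntohl()-to-htonl() change above is a no-op at the instruction level, since ntohl() and htonl() perform the same byte swap, but it documents the direction correctly: skb->seq holds the initial sequence number in host order and must be converted to network order before being written into the header. A trivial sketch of that convention (example value only):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t snt_isn = 0x12345678;      /* host-order sequence number (example) */
            uint32_t wire = htonl(snt_isn);     /* what belongs in th->seq */

            /* A receiver undoes this with ntohl(); because the two swaps are
             * identical, the old ntohl() happened to work, but htonl() states
             * the intent on the send side.
             */
            printf("host 0x%08x -> network-order word 0x%08x\n", snt_isn, wire);
            return 0;
    }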
@@ -912,11 +1060,34 @@
sizeof(struct ip_options) + req->af.v4_req.opt->optlen);
}
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+ static unsigned long warntime;
+
+ if (jiffies - warntime > HZ*60) {
+ warntime = jiffies;
+ printk(KERN_INFO
+ "possible SYN flooding on port %d. Sending cookies.\n",
+ ntohs(skb->h.th->dest));
+ }
+}
+
+int sysctl_max_syn_backlog = 1024;
+int sysctl_tcp_syn_taildrop = 1;
+
struct or_calltable or_ipv4 = {
tcp_v4_send_synack,
tcp_v4_or_free
};
+#ifdef NEW_LISTEN
+#define BACKLOG(sk) ((sk)->tp_pinfo.af_tcp.syn_backlog) /* lvalue! */
+#define BACKLOGMAX(sk) sysctl_max_syn_backlog
+#else
+#define BACKLOG(sk) ((sk)->ack_backlog)
+#define BACKLOGMAX(sk) ((sk)->max_ack_backlog)
+#endif
+
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb, void *ptr,
__u32 isn)
{
@@ -936,35 +1107,33 @@
if (sk->dead)
goto dead;
- if (sk->ack_backlog >= sk->max_ack_backlog) {
+ /* XXX: Check against a global syn pool counter. */
+ if (BACKLOG(sk) > BACKLOGMAX(sk)) {
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
- static unsigned long warntime;
-
- if (jiffies - warntime > HZ*60) {
- warntime = jiffies;
- printk(KERN_INFO
- "possible SYN flooding on port %d. Sending cookies.\n", ntohs(skb->h.th->dest));
- }
+ syn_flood_warning(skb);
want_cookie = 1;
} else
#endif
- {
- SOCK_DEBUG(sk, "dropping syn ack:%d max:%d\n", sk->ack_backlog,
- sk->max_ack_backlog);
+ if (sysctl_tcp_syn_taildrop) {
+ struct open_request *req;
+
+ req = tcp_synq_unlink_tail(&sk->tp_pinfo.af_tcp);
+ tcp_openreq_free(req);
tcp_statistics.TcpAttemptFails++;
- goto exit;
+ } else {
+ goto error;
}
} else {
if (isn == 0)
isn = tcp_v4_init_sequence(sk, skb);
- sk->ack_backlog++;
+ BACKLOG(sk)++;
}
req = tcp_openreq_alloc();
if (req == NULL) {
- tcp_statistics.TcpAttemptFails++;
- goto exit;
+ if (!want_cookie) BACKLOG(sk)--;
+ goto error;
}
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
@@ -972,7 +1141,7 @@
req->rcv_isn = skb->seq;
tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
- tcp_parse_options(th,&tp, want_cookie);
+ tcp_parse_options(th,&tp,want_cookie);
if (tp.saw_tstamp)
req->ts_recent = tp.rcv_tsval;
req->mss = tp.in_mss;
@@ -1023,15 +1192,16 @@
}
sk->data_ready(sk, 0);
-
exit:
- kfree_skb(skb, FREE_READ);
return 0;
dead:
SOCK_DEBUG(sk, "Reset on %p: Connect on dead socket.\n",sk);
tcp_statistics.TcpAttemptFails++;
return -ENOTCONN;
+error:
+ tcp_statistics.TcpAttemptFails++;
+ goto exit;
}
struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
@@ -1042,13 +1212,16 @@
struct sock *newsk;
int snd_mss;
- newsk = sk_alloc(GFP_ATOMIC);
- if (newsk == NULL) {
- if (dst)
- dst_release(dst);
- return NULL;
- }
-
+#ifdef NEW_LISTEN
+ if (sk->ack_backlog > sk->max_ack_backlog)
+ goto exit; /* head drop */
+#endif
+ newsk = sk_alloc(AF_INET, GFP_ATOMIC);
+ if (!newsk)
+ goto exit;
+#ifdef NEW_LISTEN
+ sk->ack_backlog++;
+#endif
memcpy(newsk, sk, sizeof(*newsk));
/* Or else we die! -DaveM */
@@ -1132,7 +1305,7 @@
newsk->opt && newsk->opt->srr ?
newsk->opt->faddr : newsk->daddr,
newsk->saddr, newsk->ip_tos, NULL)) {
- kfree(newsk);
+ sk_free(newsk);
return NULL;
}
dst = &rt->u.dst;
@@ -1179,73 +1352,11 @@
tcp_v4_hash(newsk);
add_to_prot_sklist(newsk);
return newsk;
-}
-
-static inline struct sock *tcp_v4_check_req(struct sock *sk, struct sk_buff *skb, struct ip_options *opt)
-{
- struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- struct open_request *req = tp->syn_wait_queue;
-
- /* assumption: the socket is not in use.
- * as we checked the user count on tcp_rcv and we're
- * running from a soft interrupt.
- */
- if(!req) {
-#ifdef CONFIG_SYN_COOKIES
- goto checkcookie;
-#else
- return sk;
-#endif
- }
- while(req) {
- if (req->af.v4_req.rmt_addr == skb->nh.iph->saddr &&
- req->af.v4_req.loc_addr == skb->nh.iph->daddr &&
- req->rmt_port == skb->h.th->source) {
- u32 flg;
-
- if (req->sk) {
- /* socket already created but not
- * yet accepted()...
- */
- sk = req->sk;
- goto ende;
- }
-
- /* Check for syn retransmission */
- flg = *(((u32 *)skb->h.th) + 3);
- flg &= __constant_htonl(0x001f0000);
- if ((flg == __constant_htonl(0x00020000)) &&
- (!after(skb->seq, req->rcv_isn))) {
- /* retransmited syn
- * FIXME: must send an ack
- */
- return NULL;
- }
-
- if (!skb->h.th->ack)
- return sk;
-
- sk = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- if (sk == NULL)
- return NULL;
-
- req->expires = 0UL;
- req->sk = sk;
- goto ende;
- }
- req = req->dl_next;
- }
-
-#ifdef CONFIG_SYN_COOKIES
-checkcookie:
- sk = cookie_v4_check(sk, skb, opt);
-#endif
-ende: skb_orphan(skb);
- if (sk)
- skb_set_owner_r(skb, sk);
- return sk;
+exit:
+ if (dst)
+ dst_release(dst);
+ return NULL;
}
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
@@ -1256,47 +1367,49 @@
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
*/
- lock_sock(sk);
-
- if (sk->state == TCP_ESTABLISHED)
- {
+ lock_sock(sk);
+
+ if (sk->state == TCP_ESTABLISHED) { /* Fast path */
if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
goto reset;
- goto ok;
- }
-
- /*
- * We check packets with only the SYN bit set against the
- * open_request queue too: This increases connection latency a bit,
- * but is required to detect retransmitted SYNs.
- *
- * The ACK/SYN bit check is probably not needed here because
- * it is checked later again (we play save now).
- */
- if (sk->state == TCP_LISTEN && (skb->h.th->ack || skb->h.th->syn)) {
- struct sock *nsk;
+ } else {
+ /* Check for embryonic sockets (open_requests)
+ * We check packets with only the SYN bit set
+ * against the open_request queue too: This
+ * increases connection latency a bit, but is
+ * required to detect retransmitted SYNs.
+ */
+ /* FIXME: need to check for multicast syns
+ * here to satisfy RFC1122 4.2.3.10, p. 104:
+ * discard bcast/mcast SYN. I'm not sure if
+ * they're filtered out at the IP layer (I
+ * think not)
+ */
+ if (sk->state == TCP_LISTEN &&
+ ((u32 *)skb->h.th)[3] & __constant_htonl(0x00120000)) {
+ struct sock *nsk;
+
+ /* Find possible connection requests. */
+ nsk = tcp_check_req(sk, skb, &(IPCB(skb)->opt));
+ if (nsk == NULL)
+ goto discard;
+
+ release_sock(sk);
+ lock_sock(nsk);
+ sk = nsk;
+ }
- /* Find possible connection requests. */
- nsk = tcp_v4_check_req(sk, skb, &(IPCB(skb)->opt));
- if (nsk == NULL)
- goto discard_it;
-
- release_sock(sk);
- lock_sock(nsk);
- sk = nsk;
+ if (tcp_rcv_state_process(sk, skb, skb->h.th,
+ &(IPCB(skb)->opt), skb->len))
+ goto reset;
}
-
- if (tcp_rcv_state_process(sk, skb, skb->h.th, &(IPCB(skb)->opt), skb->len) == 0)
- goto ok;
+ release_sock(sk);
+ return 0;
reset:
tcp_v4_send_reset(skb);
-
-discard_it:
- /* Discard frame. */
- kfree_skb(skb, FREE_READ);
-
-ok:
+discard:
+ kfree_skb(skb, FREE_READ);
release_sock(sk);
return 0;
}
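The listen-path test in the new code above reads the fourth 32-bit word of the TCP header (data offset, reserved bits and flags) and masks it in network byte order: 0x00120000 selects the flag byte and, within it, ACK (0x10) and SYN (0x02), so the open_request queue is consulted only for segments carrying SYN or ACK. A small stand-alone demonstration of the same mask (plain userspace C, header offsets per RFC 793):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Build a raw 20-byte TCP header with only the SYN bit set and show
     * that the word-3 mask picks it up, regardless of host endianness.
     */
    int main(void)
    {
            unsigned char th[20];
            uint32_t word3;

            memset(th, 0, sizeof(th));
            th[12] = 5 << 4;        /* data offset: 5 words, no options */
            th[13] = 0x02;          /* flag byte: SYN only */

            memcpy(&word3, th + 12, sizeof(word3));     /* word 3 of the header */
            if (word3 & htonl(0x00120000))
                    printf("SYN or ACK set -> check the open_request queue\n");
            else
                    printf("neither SYN nor ACK -> skip the queue lookup\n");
            return 0;
    }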
@@ -1327,8 +1440,8 @@
case CHECKSUM_HW:
if (tcp_v4_check(th,len,saddr,daddr,skb->csum)) {
struct iphdr * iph = skb->nh.iph;
- printk(KERN_DEBUG "TCPv4 bad checksum from %08x:%04x to %08x:%04x, len=%d/%d/%d\n",
- saddr, ntohs(th->source), daddr,
+ printk(KERN_DEBUG "TCPv4 bad checksum from %I:%04x to %I:%04x, len=%d/%d/%d\n",
+ &saddr, ntohs(th->source), &daddr,
ntohs(th->dest), len, skb->len, ntohs(iph->tot_len));
goto discard_it;
}
@@ -1435,6 +1548,12 @@
ip_getsockopt,
v4_addr2sockaddr,
tcp_v4_send_reset,
+ tcp_v4_search_req,
+#ifdef CONFIG_SYN_COOKIES
+ cookie_v4_check,
+#else
+ NULL,
+#endif
sizeof(struct sockaddr_in)
};
@@ -1461,6 +1580,7 @@
tp->snd_wscale = 0;
tp->sacks = 0;
tp->saw_tstamp = 0;
+ tp->syn_backlog = 0;
/*
* See draft-stevens-tcpca-spec-01 for discussion of the
@@ -1484,8 +1604,7 @@
sk->dummy_th.doff=sizeof(struct tcphdr)>>2;
/* Init SYN queue. */
- tp->syn_wait_queue = NULL;
- tp->syn_wait_last = &tp->syn_wait_queue;
+ tcp_synq_init(tp);
sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;