patch-2.4.23 linux-2.4.23/net/ipv4/ipvs/ip_vs_conn.c

diff -urN linux-2.4.22/net/ipv4/ipvs/ip_vs_conn.c linux-2.4.23/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,1562 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <linux/in.h>
+#include <linux/proc_fs.h>              /* for proc_net_* */
+#include <asm/softirq.h>                /* for local_bh_* */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>                  /* for ip_route_output */
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *  Connection hash table: for input and output packet lookups of IPVS
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/* SLAB cache for IPVS connections */
+static kmem_cache_t *ip_vs_conn_cachep;
+
+/* counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/* counter for no-client-port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ *  Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS  4
+#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+	rwlock_t	l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_read_lock(unsigned key)
+{
+	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_lock_bh(unsigned key)
+{
+	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
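+
+/*
+ * Illustrative sketch (not part of the original patch): the helpers
+ * above stripe the connection table over CT_LOCKARRAY_SIZE (16)
+ * cache-aligned rwlocks, selected by the low CT_LOCKARRAY_BITS of the
+ * bucket key, so buckets that map to different locks can be worked on
+ * concurrently.  A typical caller pattern, assuming `hash' is a
+ * bucket index:
+ *
+ *	ct_read_lock(hash);
+ *	... walk ip_vs_conn_tab[hash] ...
+ *	ct_read_unlock(hash);
+ */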
+
+
+/*
+ *	Returns hash value for IPVS connection entry
+ */
+static unsigned
+ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+	return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
+		& IP_VS_CONN_TAB_MASK;
+}
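+
+/*
+ * Note (editorial, not in the original patch): jhash_3words() mixes
+ * the tuple with the boot-time random seed ip_vs_conn_rnd, so bucket
+ * positions are not predictable by remote hosts, and the result is
+ * masked down to a valid index in [0, IP_VS_CONN_TAB_MASK].  Only the
+ * client side of the tuple is hashed, which is what lets both the
+ * in_get and out_get lookups below find the same entry.
+ */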
+
+
+/*
+ *	Hashes ip_vs_conn into ip_vs_conn_tab by proto, caddr, cport.
+ *	Returns 1 on success, 0 if the entry is already hashed.
+ */
+static int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		IP_VS_ERR("ip_vs_conn_hash(): request to hash an already-hashed entry, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/* Hash by protocol, client address and port */
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+	ct_write_lock(hash);
+
+	list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+	cp->flags |= IP_VS_CONN_F_HASHED;
+	atomic_inc(&cp->refcnt);
+
+	ct_write_unlock(hash);
+
+	return 1;
+}
+
+
+/*
+ *	Unhashes ip_vs_conn from ip_vs_conn_tab.
+ *	Returns 1 on success, 0 if the entry was not hashed.
+ */
+static int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+
+	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+		IP_VS_ERR("ip_vs_conn_unhash(): request to unhash an entry that is not hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/* unhash it and decrease its reference counter */
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+	ct_write_lock(hash);
+
+	list_del(&cp->c_list);
+	cp->flags &= ~IP_VS_CONN_F_HASHED;
+	atomic_dec(&cp->refcnt);
+
+	ct_write_unlock(hash);
+
+	return 1;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from OUTside-to-INside.
+ *	s_addr, s_port: pkt source address (foreign host)
+ *	d_addr, d_port: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp;
+	struct list_head *l,*e;
+
+	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+	l = &ip_vs_conn_tab[hash];
+
+	ct_read_lock(hash);
+
+	for (e=l->next; e!=l; e=e->next) {
+		cp = list_entry(e, struct ip_vs_conn, c_list);
+		if (s_addr==cp->caddr && s_port==cp->cport &&
+		    d_port==cp->vport && d_addr==cp->vaddr &&
+		    protocol==cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ct_read_unlock(hash);
+			return cp;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	struct ip_vs_conn *cp;
+
+	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
+		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
+
+	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+		  ip_vs_proto_name(protocol),
+		  NIPQUAD(s_addr), ntohs(s_port),
+		  NIPQUAD(d_addr), ntohs(d_port),
+		  cp?"hit":"not hit");
+
+	return cp;
+}
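+
+/*
+ * Illustrative usage from the packet path (editorial sketch; iph and
+ * th are assumed to point at valid IP and TCP headers):
+ *
+ *	struct ip_vs_conn *cp;
+ *
+ *	cp = ip_vs_conn_in_get(iph->protocol, iph->saddr, th->source,
+ *			       iph->daddr, th->dest);
+ *	if (cp) {
+ *		... forward the packet via cp->packet_xmit ...
+ *		ip_vs_conn_put(cp);	- drops the ref, re-arms the timer
+ *	}
+ *
+ * The second lookup with s_port==0 above serves entries created
+ * before the client port was known (IP_VS_CONN_F_NO_CPORT).
+ */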
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from inside-to-OUTside.
+ *	s_addr, s_port: pkt source address (inside host)
+ *	d_addr, d_port: pkt dest address (foreign host)
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp, *ret=NULL;
+	struct list_head *l,*e;
+
+	/*
+	 *	Check for "full" addressed entries
+	 */
+	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
+	l = &ip_vs_conn_tab[hash];
+
+	ct_read_lock(hash);
+
+	for (e=l->next; e!=l; e=e->next) {
+		cp = list_entry(e, struct ip_vs_conn, c_list);
+		if (d_addr == cp->caddr && d_port == cp->cport &&
+		    s_port == cp->dport && s_addr == cp->daddr &&
+		    protocol == cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ret = cp;
+			break;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+		  ip_vs_proto_name(protocol),
+		  NIPQUAD(s_addr), ntohs(s_port),
+		  NIPQUAD(d_addr), ntohs(d_port),
+		  ret?"hit":"not hit");
+
+	return ret;
+}
+
+
+/*
+ *      Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+	/* re-arm the timer so the entry expires after its timeout */
+	mod_timer(&cp->timer, jiffies+cp->timeout);
+
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ *	Timeout table[state]
+ */
+struct ip_vs_timeout_table vs_timeout_table = {
+	ATOMIC_INIT(0),	/* refcnt */
+	0,		/* scale  */
+	{
+		[IP_VS_S_NONE]          =	30*60*HZ,
+		[IP_VS_S_ESTABLISHED]	=	15*60*HZ,
+		[IP_VS_S_SYN_SENT]	=	2*60*HZ,
+		[IP_VS_S_SYN_RECV]	=	1*60*HZ,
+		[IP_VS_S_FIN_WAIT]	=	2*60*HZ,
+		[IP_VS_S_TIME_WAIT]	=	2*60*HZ,
+		[IP_VS_S_CLOSE]         =	10*HZ,
+		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
+		[IP_VS_S_LAST_ACK]	=	30*HZ,
+		[IP_VS_S_LISTEN]	=	2*60*HZ,
+		[IP_VS_S_SYNACK]	=	120*HZ,
+		[IP_VS_S_UDP]		=	5*60*HZ,
+		[IP_VS_S_ICMP]          =	1*60*HZ,
+		[IP_VS_S_LAST]          =	2*HZ,
+	},	/* timeout */
+};
+
+
+struct ip_vs_timeout_table vs_timeout_table_dos = {
+	ATOMIC_INIT(0),	/* refcnt */
+	0,		/* scale  */
+	{
+		[IP_VS_S_NONE]          =	15*60*HZ,
+		[IP_VS_S_ESTABLISHED]	=	8*60*HZ,
+		[IP_VS_S_SYN_SENT]	=	60*HZ,
+		[IP_VS_S_SYN_RECV]	=	10*HZ,
+		[IP_VS_S_FIN_WAIT]	=	60*HZ,
+		[IP_VS_S_TIME_WAIT]	=	60*HZ,
+		[IP_VS_S_CLOSE]         =	10*HZ,
+		[IP_VS_S_CLOSE_WAIT]	=	60*HZ,
+		[IP_VS_S_LAST_ACK]	=	30*HZ,
+		[IP_VS_S_LISTEN]	=	2*60*HZ,
+		[IP_VS_S_SYNACK]	=	100*HZ,
+		[IP_VS_S_UDP]		=	3*60*HZ,
+		[IP_VS_S_ICMP]          =	1*60*HZ,
+		[IP_VS_S_LAST]          =	2*HZ,
+	},	/* timeout */
+};
+
+
+/*
+ *	Timeout table to use for the VS entries
+ *	If NULL we use the default table (vs_timeout_table).
+ *	Under flood attack we switch to vs_timeout_table_dos
+ */
+
+static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table;
+
+static const char * state_name_table[IP_VS_S_LAST+1] = {
+	[IP_VS_S_NONE]          =	"NONE",
+	[IP_VS_S_ESTABLISHED]	=	"ESTABLISHED",
+	[IP_VS_S_SYN_SENT]	=	"SYN_SENT",
+	[IP_VS_S_SYN_RECV]	=	"SYN_RECV",
+	[IP_VS_S_FIN_WAIT]	=	"FIN_WAIT",
+	[IP_VS_S_TIME_WAIT]	=	"TIME_WAIT",
+	[IP_VS_S_CLOSE]         =	"CLOSE",
+	[IP_VS_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
+	[IP_VS_S_LAST_ACK]	=	"LAST_ACK",
+	[IP_VS_S_LISTEN]	=	"LISTEN",
+	[IP_VS_S_SYNACK]	=	"SYNACK",
+	[IP_VS_S_UDP]		=	"UDP",
+	[IP_VS_S_ICMP]          =	"ICMP",
+	[IP_VS_S_LAST]          =	"BUG!",
+};
+
+#define sNO IP_VS_S_NONE
+#define sES IP_VS_S_ESTABLISHED
+#define sSS IP_VS_S_SYN_SENT
+#define sSR IP_VS_S_SYN_RECV
+#define sFW IP_VS_S_FIN_WAIT
+#define sTW IP_VS_S_TIME_WAIT
+#define sCL IP_VS_S_CLOSE
+#define sCW IP_VS_S_CLOSE_WAIT
+#define sLA IP_VS_S_LAST_ACK
+#define sLI IP_VS_S_LISTEN
+#define sSA IP_VS_S_SYNACK
+
+struct vs_tcp_states_t {
+	int next_state[IP_VS_S_LAST];	/* should be _LAST_TCP */
+};
+
+const char * ip_vs_state_name(int state)
+{
+	if (state >= IP_VS_S_LAST)
+		return "ERR!";
+	return state_name_table[state] ? state_name_table[state] : "?";
+}
+
+static struct vs_tcp_states_t vs_tcp_states [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct vs_tcp_states_t vs_tcp_states_dos [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states;
+
+void ip_vs_secure_tcp_set(int on)
+{
+	if (on) {
+		ip_vs_state_table = vs_tcp_states_dos;
+		ip_vs_timeout_table = &vs_timeout_table_dos;
+	} else {
+		ip_vs_state_table = vs_tcp_states;
+		ip_vs_timeout_table = &vs_timeout_table;
+	}
+}
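+
+/*
+ * Usage sketch (editorial; the exact caller is an assumption here, in
+ * the original code base this is the defense logic behind the
+ * secure_tcp sysctl): both tables are flipped with plain pointer
+ * stores, which readers pick up on their next lookup:
+ *
+ *	ip_vs_secure_tcp_set(1);	- under SYN flood: DoS tables
+ *	...
+ *	ip_vs_secure_tcp_set(0);	- back to the normal tables
+ */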
+
+
+static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off)
+{
+	/*
+	 *	[0-3]: input states, [4-7]: output, [8-11] input only states.
+	 */
+	if (th->rst)
+		return state_off+3;
+	if (th->syn)
+		return state_off+0;
+	if (th->fin)
+		return state_off+1;
+	if (th->ack)
+		return state_off+2;
+	return -1;
+}
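+
+/*
+ * Worked example (editorial): per the offsets noted above, INPUT rows
+ * start at 0, OUTPUT at 4, INPUT_ONLY at 8, and within each group the
+ * row order is syn, fin, ack, rst.  A packet with both SYN and RST
+ * set, seen on the output side, therefore selects index 4+3 == 7
+ * (the OUTPUT/rst row), since RST is tested first.
+ */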
+
+
+static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state)
+{
+	struct ip_vs_timeout_table *vstim = cp->timeout_table;
+
+	/*
+	 *	Use default timeout table if no specific for this entry
+	 */
+	if (!vstim)
+		vstim = &vs_timeout_table;
+
+	cp->timeout = vstim->timeout[cp->state=state];	/* set new state, pick its timeout */
+
+	if (vstim->scale) {
+		int scale = vstim->scale;
+
+		if (scale<0)
+			cp->timeout >>= -scale;
+		else if (scale > 0)
+			cp->timeout <<= scale;
+	}
+
+	return state;
+}
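+
+/*
+ * Editorial example: both tables above use scale==0, so timeouts are
+ * taken as-is.  A hypothetical scale of -1 would halve every timeout
+ * (right shift) and +1 would double it; e.g. IP_VS_S_ESTABLISHED
+ * (15*60*HZ) with scale==-1 would expire after 7.5 minutes.
+ */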
+
+
+static inline int
+vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th)
+{
+	int state_idx;
+	int new_state = IP_VS_S_CLOSE;
+
+	/*
+	 *    Update state offset to INPUT_ONLY if necessary
+	 *    or delete NO_OUTPUT flag if output packet detected
+	 */
+	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+		if (state_off == VS_STATE_OUTPUT)
+			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+		else
+			state_off = VS_STATE_INPUT_ONLY;
+	}
+
+	if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) {
+		IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n",
+			  state_off, state_idx);
+		goto tcp_state_out;
+	}
+
+	new_state = ip_vs_state_table[state_idx].next_state[cp->state];
+
+  tcp_state_out:
+	if (new_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
+			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+			  ip_vs_proto_name(cp->protocol),
+			  (state_off==VS_STATE_OUTPUT)?"output ":"input ",
+			  th->syn? 'S' : '.',
+			  th->fin? 'F' : '.',
+			  th->ack? 'A' : '.',
+			  th->rst? 'R' : '.',
+			  NIPQUAD(cp->daddr), ntohs(cp->dport),
+			  NIPQUAD(cp->caddr), ntohs(cp->cport),
+			  ip_vs_state_name(cp->state),
+			  ip_vs_state_name(new_state),
+			  atomic_read(&cp->refcnt));
+		if (dest) {
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			    (new_state != IP_VS_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (new_state == IP_VS_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+
+	return vs_set_state_timeout(cp, new_state);
+}
+
+
+/*
+ *	Handle state transitions
+ */
+int ip_vs_set_state(struct ip_vs_conn *cp,
+		    int state_off, struct iphdr *iph, void *tp)
+{
+	int ret;
+
+	spin_lock(&cp->lock);
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		ret = vs_tcp_state(cp, state_off, tp);
+		break;
+	case IPPROTO_UDP:
+		ret = vs_set_state_timeout(cp, IP_VS_S_UDP);
+		break;
+	case IPPROTO_ICMP:
+		ret = vs_set_state_timeout(cp, IP_VS_S_ICMP);
+		break;
+	default:
+		ret = -1;
+	}
+	spin_unlock(&cp->lock);
+
+	return ret;
+}
+
+
+/*
+ *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+int ip_vs_conn_listen(struct ip_vs_conn *cp)
+{
+	vs_set_state_timeout(cp, IP_VS_S_LISTEN);
+	return cp->timeout;
+}
+
+
+/*
+ *      Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available; this may only be used in a transparent cache cluster.
+ */
+static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = skb->nh.iph;
+	u8     tos = iph->tos;
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) {
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+		goto tx_error_icmp;
+	}
+
+	/* MTU checking */
+	mtu = rt->u.dst.pmtu;
+	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* update checksum because skb might be defragmented */
+	ip_send_check(iph);
+
+	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
+		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
+			ip_rt_put(rt);
+			IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
+			goto tx_error;
+		}
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+#endif /* CONFIG_NETFILTER_DEBUG */
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+	ip_send(skb);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
+
+/*
+ *      NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	return NF_ACCEPT;
+}
+
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ */
+static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	struct iphdr  *iph;
+	union ip_vs_tphdr h;
+	int ihl;
+	unsigned short size;
+	int mtu;
+
+	EnterFunction(10);
+
+	/*
+	 * If the connection has an ip_vs_app helper, the helper may change
+	 * the payload, so full checksum checking and recalculation are
+	 * needed.  If not, only the header (such as IP address and port
+	 * number) is changed, so a fast incremental checksum update is
+	 * enough, and the destination host does the final checksum checking.
+	 */
+
+	if (cp->app && skb_is_nonlinear(skb)
+	    && skb_linearize(skb, GFP_ATOMIC) != 0)
+		return NF_DROP;
+
+	iph = skb->nh.iph;
+	ihl = iph->ihl << 2;
+	h.raw = (char*) iph + ihl;
+	size = ntohs(iph->tot_len) - ihl;
+
+	/* do TCP/UDP checksum checking if it has application helper */
+	if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_NONE:
+			skb->csum = csum_partial(h.raw, size, 0);
+			/* fall through to verify the computed checksum */
+
+		case CHECKSUM_HW:
+			if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
+					      iph->protocol, skb->csum)) {
+				IP_VS_DBG_RL("Incoming failed %s checksum "
+					     "from %d.%d.%d.%d (size=%d)!\n",
+					     ip_vs_proto_name(iph->protocol),
+					     NIPQUAD(iph->saddr),
+					     size);
+				goto tx_error;
+			}
+			break;
+		default:
+			/* CHECKSUM_UNNECESSARY */
+			break;
+		}
+	}
+
+	/*
+	 *  Check if it is no_cport connection ...
+	 */
+	if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+		atomic_dec(&ip_vs_conn_no_cport_cnt);
+		ip_vs_conn_unhash(cp);
+		cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+		cp->cport = h.portp[0];
+		/* rehash on the now-known cport */
+		ip_vs_conn_hash(cp);
+
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->cport));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = rt->u.dst.pmtu;
+	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* copy-on-write the packet before mangling it */
+	if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
+		return NF_DROP;
+
+	/* mangle the packet */
+	iph->daddr = cp->daddr;
+	h.portp[1] = cp->dport;
+
+	/*
+	 *	Attempt ip_vs_app call.
+	 *	will fix ip_vs_conn and iph ack_seq stuff
+	 */
+	if (ip_vs_app_pkt_in(cp, skb) != 0) {
+		/* skb data has probably changed, update pointers */
+		iph = skb->nh.iph;
+		h.raw = (char*) iph + ihl;
+		size = skb->len - ihl;
+	}
+
+	/*
+	 *	Adjust TCP/UDP checksums
+	 */
+	if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr,
+					cp->vport, cp->dport, iph->protocol);
+		if (skb->ip_summed == CHECKSUM_HW)
+			skb->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		switch (iph->protocol) {
+		case IPPROTO_TCP:
+			h.th->check = 0;
+			h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+							size, iph->protocol,
+							csum_partial(h.raw, size, 0));
+			break;
+		case IPPROTO_UDP:
+			h.uh->check = 0;
+			h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+							size, iph->protocol,
+							csum_partial(h.raw, size, 0));
+			if (h.uh->check == 0)
+				h.uh->check = 0xFFFF;
+			break;
+		}
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	ip_send_check(iph);
+
+	IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->daddr), ntohs(h.portp[1]));
+
+	/* FIXME: when the application helper enlarges the packet and the
+	   length becomes larger than the MTU of the outgoing device, there
+	   will still be an MTU problem. */
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+#endif /* CONFIG_NETFILTER_DEBUG */
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+	ip_send(skb);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing through the load balancer. This can greatly
+ *   increase the scalability of a virtual server.
+ */
+static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *old_iph = skb->nh.iph;
+	u8     tos = old_iph->tos;
+	u16    df = old_iph->frag_off;
+	struct iphdr  *iph;			/* Our new IP header */
+	int    max_headroom;			/* The extra header space needed */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (skb->protocol != __constant_htons(ETH_P_IP)) {
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+			     "ETH_P_IP: %d, skb protocol: %d\n",
+			     __constant_htons(ETH_P_IP), skb->protocol);
+		goto tx_error;
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+		goto tx_error_icmp;
+
+	tdev = rt->u.dst.dev;
+
+	mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
+	if (mtu < 68) {
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+		goto tx_error;
+	}
+	if (skb->dst && mtu < skb->dst->pmtu)
+		skb->dst->pmtu = mtu;
+
+	df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+	if ((old_iph->frag_off&__constant_htons(IP_DF))
+	    && mtu < ntohs(old_iph->tot_len)) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* update checksum because skb might be defragmented */
+	ip_send_check(old_iph);
+
+	skb->h.raw = skb->nh.raw;
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+			return NF_DROP;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = skb->nh.iph;
+	}
+
+	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	skb->nh.iph;
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	tos;
+	iph->daddr		=	rt->rt_dst;
+	iph->saddr		=	rt->rt_src;
+	iph->ttl		=	old_iph->ttl;
+	iph->tot_len		=	htons(skb->len);
+	ip_select_ident(iph, &rt->u.dst, NULL);
+	ip_send_check(iph);
+
+	skb->ip_summed = CHECKSUM_NONE;
+#ifdef CONFIG_NETFILTER_DEBUG
+	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+#endif /* CONFIG_NETFILTER_DEBUG */
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+	ip_send(skb);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
+
+/*
+ *      Direct Routing transmitter
+ */
+static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = skb->nh.iph;
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = rt->u.dst.pmtu;
+	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* update checksum because skb might be defragmented */
+	ip_send_check(iph);
+
+	if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
+		if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
+			ip_rt_put(rt);
+			IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
+			goto tx_error;
+		}
+	}
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	skb->nf_debug = 1 << NF_IP_LOCAL_OUT;
+#endif /* CONFIG_NETFILTER_DEBUG */
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+	ip_send(skb);
+
+#if 0000
+	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		do_ip_send);
+#endif
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	return NF_STOLEN;
+}
+
+
+/*
+ *  Bind a connection entry with the corresponding packet_xmit.
+ *  Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit;
+		break;
+	}
+}
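+
+/*
+ * Editorial sketch of the resulting dispatch: after binding, the
+ * packet path forwards with one indirect call instead of re-testing
+ * the forwarding method on every packet:
+ *
+ *	if (cp->packet_xmit)
+ *		verdict = cp->packet_xmit(skb, cp);
+ *	- returns NF_STOLEN, NF_ACCEPT or NF_DROP as seen above
+ */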
+
+
+/*
+ *  Bind a connection entry with a virtual service destination
+ *  Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+	/* if dest is NULL, then return directly */
+	if (!dest)
+		return;
+
+	/* Increase the refcnt counter of the dest */
+	atomic_inc(&dest->refcnt);
+
+	/* Bind with the destination and its corresponding transmitter */
+	cp->flags |= atomic_read(&dest->conn_flags);
+	cp->dest = dest;
+
+	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
+		  ip_vs_proto_name(cp->protocol),
+		  NIPQUAD(cp->caddr), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
+		  cp->flags, atomic_read(&cp->refcnt),
+		  atomic_read(&dest->refcnt));
+}
+
+
+/*
+ *  Unbind a connection entry with its VS destination
+ *  Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest = cp->dest;
+
+	/* if dest is NULL, then return directly */
+	if (!dest)
+		return;
+
+	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
+		  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
+		  "s:%s flg:%X cnt:%d destcnt:%d",
+		  ip_vs_proto_name(cp->protocol),
+		  NIPQUAD(cp->caddr), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
+		  cp->flags, atomic_read(&cp->refcnt),
+		  atomic_read(&dest->refcnt));
+
+	/*
+	 * Decrease the inactconns or activeconns counter
+	 * if it is not a connection template ((cp->cport!=0)
+	 *   || (cp->flags & IP_VS_CONN_F_NO_CPORT)).
+	 */
+	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+			atomic_dec(&dest->inactconns);
+		} else {
+			atomic_dec(&dest->activeconns);
+		}
+	}
+
+	/*
+	 * Simply decrease the refcnt of the dest, because the
+	 * dest will be either in service's destination list
+	 * or in the trash.
+	 */
+	atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ *  Checking if the destination of a connection template is available.
+ *  If available, return 1, otherwise invalidate this connection
+ *  template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+	struct ip_vs_dest *dest = ct->dest;
+
+	/*
+	 * Checking the dest server status.
+	 */
+	if ((dest == NULL) ||
+	    !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		IP_VS_DBG(9, "check_template: dest not available for "
+			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+			  "-> d:%u.%u.%u.%u:%d\n",
+			  ip_vs_proto_name(ct->protocol),
+			  NIPQUAD(ct->caddr), ntohs(ct->cport),
+			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
+			  NIPQUAD(ct->daddr), ntohs(ct->dport));
+
+		/*
+		 * Invalidate the connection template
+		 */
+		ip_vs_conn_unhash(ct);
+		ct->dport = 65535;
+		ct->vport = 65535;
+		ct->cport = 0;
+		ip_vs_conn_hash(ct);
+
+		/*
+		 * Simply decrease the refcnt of the template,
+		 * don't restart its timer.
+		 */
+		atomic_dec(&ct->refcnt);
+		return 0;
+	}
+	return 1;
+}
+
+
+static inline void
+ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim)
+{
+	atomic_inc(&vstim->refcnt);
+	cp->timeout_table = vstim;
+}
+
+static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp)
+{
+	struct ip_vs_timeout_table *vstim = cp->timeout_table;
+
+	if (!vstim)
+		return;
+	cp->timeout_table = NULL;
+	atomic_dec(&vstim->refcnt);
+}
+
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
+	if (cp->timeout_table)
+		cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT];
+	else
+		cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT];
+
+	/*
+	 *	hey, I'm using it
+	 */
+	atomic_inc(&cp->refcnt);
+
+	/*
+	 *	do I control anybody?
+	 */
+	if (atomic_read(&cp->n_control))
+		goto expire_later;
+
+	/*
+	 *	unhash it if it is hashed in the conn table
+	 */
+	ip_vs_conn_unhash(cp);
+
+	/*
+	 *	refcnt==1 implies I'm the only referrer
+	 */
+	if (likely(atomic_read(&cp->refcnt) == 1)) {
+		/* make sure that there is no timer on it now */
+		if (timer_pending(&cp->timer))
+			del_timer(&cp->timer);
+
+		/* does anybody control me? */
+		if (cp->control)
+			ip_vs_control_del(cp);
+
+		ip_vs_unbind_dest(cp);
+		ip_vs_unbind_app(cp);
+		ip_vs_timeout_detach(cp);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		atomic_dec(&ip_vs_conn_count);
+
+		kmem_cache_free(ip_vs_conn_cachep, cp);
+		return;
+	}
+
+	/* hash it back to the table */
+	ip_vs_conn_hash(cp);
+
+  expire_later:
+	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+		  atomic_read(&cp->refcnt)-1,
+		  atomic_read(&cp->n_control));
+
+	ip_vs_conn_put(cp);
+}
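+
+/*
+ * Note on the refcount dance above (editorial): the expire handler
+ * takes its own reference first, so refcnt==1 really means "the timer
+ * is the last user".  If anything else still holds the entry, or the
+ * entry still controls other connections, it is hashed back and
+ * ip_vs_conn_put() re-arms the timer for one more timeout period.
+ */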
+
+
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+	cp->timeout = 0;
+	mod_timer(&cp->timer, jiffies);
+	__ip_vs_conn_put(cp);
+}
+
+/*
+ *  Create a new connection entry and hash it into the ip_vs_conn_tab.
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
+	       __u32 daddr, __u16 dport, unsigned flags,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_conn *cp;
+
+	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	if (cp == NULL) {
+		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+		return NULL;
+	}
+
+	memset(cp, 0, sizeof(*cp));
+	INIT_LIST_HEAD(&cp->c_list);
+	init_timer(&cp->timer);
+	cp->timer.data     = (unsigned long)cp;
+	cp->timer.function = ip_vs_conn_expire;
+	ip_vs_timeout_attach(cp, ip_vs_timeout_table);
+	cp->protocol	   = proto;
+	cp->caddr	   = caddr;
+	cp->cport	   = cport;
+	cp->vaddr	   = vaddr;
+	cp->vport	   = vport;
+	cp->daddr          = daddr;
+	cp->dport          = dport;
+	cp->flags	   = flags;
+	cp->app_data	   = NULL;
+	cp->control	   = NULL;
+	cp->lock           = SPIN_LOCK_UNLOCKED;
+
+	atomic_set(&cp->n_control, 0);
+	atomic_set(&cp->in_pkts, 0);
+
+	atomic_inc(&ip_vs_conn_count);
+	if (flags & IP_VS_CONN_F_NO_CPORT)
+		atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+	/* Bind its application helper (only for VS/NAT) if any */
+	ip_vs_bind_app(cp);
+
+	/* Bind the connection with a destination server */
+	ip_vs_bind_dest(cp, dest);
+
+	/* Set its state and timeout */
+	vs_set_state_timeout(cp, IP_VS_S_NONE);
+
+	/* Bind its packet transmitter */
+	ip_vs_bind_xmit(cp);
+
+	/*
+	 * Mark the entry as referenced by the current thread before
+	 * hashing it into the table, so that another thread running
+	 * ip_vs_random_dropentry cannot drop this entry.
+	 */
+	atomic_set(&cp->refcnt, 1);
+
+	/* Hash it in the ip_vs_conn_tab finally */
+	ip_vs_conn_hash(cp);
+
+	return cp;
+}
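+
+/*
+ * Illustrative call (editorial sketch): a scheduling path might create
+ * a NAT entry after picking a destination `dest' for the incoming
+ * headers iph/th (all three names are assumptions here):
+ *
+ *	cp = ip_vs_conn_new(iph->protocol,
+ *			    iph->saddr, th->source,	- client
+ *			    iph->daddr, th->dest,	- virtual service
+ *			    dest->addr, dest->port,	- real server
+ *			    0, dest);
+ *	if (cp == NULL)
+ *		return NF_DROP;
+ *
+ * The entry comes back hashed, with refcnt==1 held by the caller.
+ */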
+
+
+/*
+ *	/proc/net/ip_vs_conn entries
+ */
+static int
+ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos=0;
+	int idx, len=0;
+	char temp[70];
+	struct ip_vs_conn *cp;
+	struct list_head *l, *e;
+
+	pos = 128;
+	if (pos > offset) {
+		len += sprintf(buffer+len, "%-127s\n",
+			       "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires");
+	}
+
+	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		/*
+		 *	The lock is actually only needed in the inner loop;
+		 *	we are called from user space, so bottom halves
+		 *	must be disabled.
+		 */
+		ct_read_lock_bh(idx);
+
+		l = &ip_vs_conn_tab[idx];
+		for (e=l->next; e!=l; e=e->next) {
+			cp = list_entry(e, struct ip_vs_conn, c_list);
+			pos += 128;
+			if (pos <= offset)
+				continue;
+			sprintf(temp,
+				"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr), ntohs(cp->cport),
+				ntohl(cp->vaddr), ntohs(cp->vport),
+				ntohl(cp->daddr), ntohs(cp->dport),
+				ip_vs_state_name(cp->state),
+				(cp->timer.expires-jiffies)/HZ);
+			len += sprintf(buffer+len, "%-127s\n", temp);
+			if (pos >= offset+length) {
+				ct_read_unlock_bh(idx);
+				goto done;
+			}
+		}
+		ct_read_unlock_bh(idx);
+	}
+
+  done:
+	*start = buffer+len-(pos-offset);       /* Start of wanted data */
+	len = pos-offset;
+	if (len > length)
+		len = length;
+	if (len < 0)
+		len = 0;
+	return len;
+}
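+
+/*
+ * Editorial sample of the resulting /proc/net/ip_vs_conn layout
+ * (addresses and ports in hex, expiry in seconds; the values in the
+ * second line are invented for illustration):
+ *
+ * Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires
+ * TCP C0A80001 0400 C0A80101 0050 0A000001 0050 ESTABLISHED     899
+ */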
+
+
+/*
+ *      Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+	/*
+	 * The drop rate array needs tuning for real environments.
+	 * Called from timer bh only => no locking
+	 */
+	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+	static char todrop_counter[9] = {0};
+	int i;
+
+	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+	   This leaves enough time for a normal connection to get
+	   through. */
+	if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
+		return 0;
+
+	/* Don't drop the entry if its number of incoming packets is not
+	   in the range [0, 8] */
+	i = atomic_read(&cp->in_pkts);
+	if (i > 8 || i < 0) return 0;
+
+	if (!todrop_rate[i]) return 0;
+	if (--todrop_counter[i] > 0) return 0;
+
+	todrop_counter[i] = todrop_rate[i];
+	return 1;
+}
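+
+/*
+ * Note on the rates above (editorial): for an entry that has seen
+ * i incoming packets (1 <= i <= 8), todrop_rate[i] == i makes
+ * todrop_entry() report one hit out of every i eligible calls, so
+ * low-traffic entries are shed most aggressively; entries with 0
+ * packets (rate 0) or more than 8 packets are never dropped here.
+ */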
+
+
+void ip_vs_random_dropentry(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct list_head *l,*e;
+	struct ip_vs_conn *ct;
+
+	/*
+	 * Randomly scan 1/32 of the whole table every second
+	 */
+	for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) {
+		unsigned hash = net_random()&IP_VS_CONN_TAB_MASK;
+
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock(hash);
+
+		l = &ip_vs_conn_tab[hash];
+		for (e=l->next; e!=l; e=e->next) {
+			cp = list_entry(e, struct ip_vs_conn, c_list);
+			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+				/* connection template */
+				continue;
+			switch(cp->state) {
+			case IP_VS_S_SYN_RECV:
+			case IP_VS_S_SYNACK:
+				break;
+
+			case IP_VS_S_ESTABLISHED:
+			case IP_VS_S_UDP:
+				if (todrop_entry(cp))
+					break;
+				continue;
+
+			default:
+				continue;
+			}
+
+			/*
+			 * Drop the entry, and drop its ct if not referenced
+			 */
+			atomic_inc(&cp->refcnt);
+			ct_write_unlock(hash);
+
+			if ((ct = cp->control))
+				atomic_inc(&ct->refcnt);
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (ct) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(ct);
+			}
+			ct_write_lock(hash);
+		}
+		ct_write_unlock(hash);
+	}
+}
+
+
+/*
+ *      Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct list_head *l,*e;
+	struct ip_vs_conn *ct;
+
+  flush_again:
+	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(idx);
+
+		l = &ip_vs_conn_tab[idx];
+		for (e=l->next; e!=l; e=e->next) {
+			cp = list_entry(e, struct ip_vs_conn, c_list);
+			atomic_inc(&cp->refcnt);
+			ct_write_unlock(idx);
+
+			if ((ct = cp->control))
+				atomic_inc(&ct->refcnt);
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (ct) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(ct);
+			}
+			ct_write_lock(idx);
+		}
+		ct_write_unlock_bh(idx);
+	}
+
+	/* the counter may not be zero yet, because some conn entries
+	   may still be run by slow timer handlers, or be unhashed but
+	   still referenced */
+	if (atomic_read(&ip_vs_conn_count) != 0) {
+		schedule();
+		goto flush_again;
+	}
+}
+
+
+int ip_vs_conn_init(void)
+{
+	int idx;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	IP_VS_INFO("Connection hash table configured "
+		   "(size=%d, memory=%ldKbytes)\n",
+		   IP_VS_CONN_TAB_SIZE,
+		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+	}
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
+		__ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
+	}
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+void ip_vs_conn_cleanup(void)
+{
+	/* flush all the connection entries first */
+	ip_vs_conn_flush();
+
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	proc_net_remove("ip_vs_conn");
+	vfree(ip_vs_conn_tab);
+}
