patch-2.3.99-pre2 linux/net/ipv4/netfilter/ip_nat_core.c


diff -u --recursive --new-file v2.3.99-pre1/linux/net/ipv4/netfilter/ip_nat_core.c linux/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,855 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
+   Public Licence. */
+#ifdef MODULE
+#define __NO_VERSION__
+#endif
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/brlock.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>  /* For tcp_prot in getorigdst */
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_nat_lock);
+
+#define IP_NAT_HTABLE_SIZE 64
+
+static struct list_head bysource[IP_NAT_HTABLE_SIZE];
+static struct list_head byipsproto[IP_NAT_HTABLE_SIZE];
+LIST_HEAD(protos);
+static LIST_HEAD(helpers);
+
+extern struct ip_nat_protocol unknown_nat_protocol;
+
+/* We keep extra hashes for each conntrack, for fast searching. */
+static inline size_t
+hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
+{
+	/* Modified src and dst, to ensure we don't create two
+           identical streams. */
+	return (src + dst + proto) % IP_NAT_HTABLE_SIZE;
+}
+
+static inline size_t
+hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
+{
+	/* Original src, to ensure we map it consistently if possible. */
+	return (manip->ip + manip->u.all + proto) % IP_NAT_HTABLE_SIZE;
+}
+
+/* No one is using the conntrack by the time this is called. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+	struct ip_nat_info *info = &conn->nat.info;
+
+	if (!info->initialized)
+		return;
+
+	IP_NF_ASSERT(info->bysource.conntrack);
+	IP_NF_ASSERT(info->byipsproto.conntrack);
+
+	WRITE_LOCK(&ip_nat_lock);
+	LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL]
+					  .tuple.src,
+					  conn->tuplehash[IP_CT_DIR_ORIGINAL]
+					  .tuple.dst.protonum)],
+		    &info->bysource);
+
+	LIST_DELETE(&byipsproto
+		    [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY]
+				      .tuple.src.ip,
+				      conn->tuplehash[IP_CT_DIR_REPLY]
+				      .tuple.dst.ip,
+				      conn->tuplehash[IP_CT_DIR_REPLY]
+				      .tuple.dst.protonum)],
+		    &info->byipsproto);
+	WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong.  Also works for incomplete packets (eg. ICMP dest
+ * unreachables.) */
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+	u_int32_t diffs[] = { oldvalinv, newval };
+	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+				      oldcheck^0xFFFF));
+}
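
For illustration, here is a minimal sketch of how a per-protocol module (such as the TCP one registered in ip_nat_init() below) could be expected to use this helper when it rewrites a source address and port.  Only ip_nat_cheat_check() comes from this file; tcph, oldip, newip, oldport and newport are names assumed for the sketch.

	/* Sketch only: fold the address and port deltas into an existing
	 * TCP checksum without re-summing the payload. */
	tcph->check = ip_nat_cheat_check(~oldip, newip,
					 ip_nat_cheat_check(oldport ^ 0xFFFF,
							    newport,
							    tcph->check));
	tcph->source = newport;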
+
+static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
+{
+	return i->protonum == proto;
+}
+
+struct ip_nat_protocol *
+find_nat_proto(u_int16_t protonum)
+{
+	struct ip_nat_protocol *i;
+
+	MUST_BE_READ_LOCKED(&ip_nat_lock);
+	i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
+	if (!i)
+		i = &unknown_nat_protocol;
+	return i;
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+		  const struct ip_conntrack *ignored_conntrack)
+{
+	/* Conntrack tracking doesn't keep track of outgoing tuples; only
+	   incoming ones.  NAT means they don't have a fixed mapping,
+	   so we invert the tuple and look for the incoming reply.
+
+	   We could keep a separate hash if this proves too slow. */
+	struct ip_conntrack_tuple reply;
+
+	invert_tuplepr(&reply, tuple);
+	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* Does the tuple + the source manip fall within the range mr? */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+	 const struct ip_conntrack_manip *manip,
+	 const struct ip_nat_multi_range *mr)
+{
+	struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
+	unsigned int i;
+	struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
+
+	for (i = 0; i < mr->rangesize; i++) {
+		/* If we are allowed to map IPs, then we must be in the
+		   range specified, otherwise we must be unchanged. */
+		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+			if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
+			    || (ntohl(newtuple.src.ip)
+				> ntohl(mr->range[i].max_ip)))
+				continue;
+		} else {
+			if (newtuple.src.ip != tuple->src.ip)
+				continue;
+		}
+
+		if ((mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+		    && proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
+				       &mr->range[i].min, &mr->range[i].max))
+			return 1;
+	}
+	return 0;
+}
+
+static inline int
+src_cmp(const struct ip_nat_hash *i,
+	const struct ip_conntrack_tuple *tuple,
+	const struct ip_nat_multi_range *mr)
+{
+	return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+		== tuple->dst.protonum
+		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+		== tuple->src.ip
+		&& i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+		== tuple->src.u.all
+		&& in_range(tuple,
+			    &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			    .tuple.src,
+			    mr));
+}
+
+/* Only called for SRC manip */
+static struct ip_conntrack_manip *
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+		     const struct ip_nat_multi_range *mr)
+{
+	unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
+	struct ip_nat_hash *i;
+
+	MUST_BE_READ_LOCKED(&ip_nat_lock);
+	i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
+	if (i)
+		return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
+	else
+		return NULL;
+}
+
+/* If it's really a local destination manip, it may need to do a
+   source manip too. */
+static int
+do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
+{
+	struct rtable *rt;
+
+	/* FIXME: IPTOS_TOS(iph->tos) --RR */
+	if (ip_route_output(&rt, var_ip, 0, 0, 0) != 0) {
+		DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
+		       IP_PARTS(var_ip));
+		return 0;
+	}
+
+	*other_ipp = rt->rt_src;
+	ip_rt_put(rt);
+	return 1;
+}
+
+/* Simple way to iterate through all. */
+static inline int fake_cmp(const struct ip_nat_hash *i,
+			   u_int32_t src, u_int32_t dst, u_int16_t protonum,
+			   unsigned int *score,
+			   const struct ip_conntrack *conntrack)
+{
+	/* Compare backwards: we're dealing with OUTGOING tuples, and
+           inside the conntrack is the REPLY tuple.  Don't count this
+           conntrack. */
+	if (i->conntrack != conntrack
+	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
+	    && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
+	    && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
+		== protonum))
+		(*score)++;
+	return 0;
+}
+
+static inline unsigned int
+count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
+	   const struct ip_conntrack *conntrack)
+{
+	unsigned int score = 0;
+
+	MUST_BE_READ_LOCKED(&ip_nat_lock);
+	LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)],
+		  fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score,
+		  conntrack);
+
+	return score;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
+   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+   1-65535, we don't do pro-rata allocation based on ports; we choose
+   the ip with the lowest src-ip/dst-ip/proto usage.
+
+   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
+   range), we eliminate that and try again.  This is not the most
+   efficient approach, but if you're worried about that, don't hand us
+   ranges you don't really have.  */
+static struct ip_nat_range *
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+		    const struct ip_nat_multi_range *mr,
+		    const struct ip_conntrack *conntrack,
+		    unsigned int hooknum)
+{
+	unsigned int i;
+	struct {
+		const struct ip_nat_range *range;
+		unsigned int score;
+		struct ip_conntrack_tuple tuple;
+	} best = { NULL,  0xFFFFFFFF };
+	u_int32_t *var_ipp, *other_ipp, saved_ip;
+
+	if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
+		var_ipp = &tuple->src.ip;
+		saved_ip = tuple->dst.ip;
+		other_ipp = &tuple->dst.ip;
+	} else {
+		var_ipp = &tuple->dst.ip;
+		saved_ip = tuple->src.ip;
+		other_ipp = &tuple->src.ip;
+	}
+
+	IP_NF_ASSERT(mr->rangesize >= 1);
+	for (i = 0; i < mr->rangesize; i++) {
+		u_int32_t minip, maxip;
+
+		/* Don't do ranges which are already eliminated. */
+		if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
+			continue;
+		}
+
+		if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
+			minip = mr->range[i].min_ip;
+			maxip = mr->range[i].max_ip;
+		} else
+			minip = maxip = *var_ipp;
+
+		for (*var_ipp = minip;
+		     ntohl(*var_ipp) <= ntohl(maxip);
+		     *var_ipp = htonl(ntohl(*var_ipp) + 1)) {
+			unsigned int score;
+
+			/* Reset the other ip in case it was mangled by
+			 * do_extra_mangle last time. */
+			*other_ipp = saved_ip;
+
+			if (hooknum == NF_IP_LOCAL_OUT
+			    && !do_extra_mangle(*var_ipp, other_ipp)) {
+				DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
+				       i, IP_PARTS(*var_ipp));
+				/* Can't route?  This whole range part is
+				 * probably screwed, but keep trying
+				 * anyway. */
+				continue;
+			}
+
+			/* Count how many others map onto this. */
+			score = count_maps(tuple->src.ip, tuple->dst.ip,
+					   tuple->dst.protonum, conntrack);
+			if (score < best.score) {
+				/* Optimization: doesn't get any better than
+				   this. */
+				if (score == 0)
+					return (struct ip_nat_range *)
+						&mr->range[i];
+
+				best.score = score;
+				best.tuple = *tuple;
+				best.range = &mr->range[i];
+			}
+		}
+	}
+	*tuple = best.tuple;
+
+	/* Discard const. */
+	return (struct ip_nat_range *)best.range;
+}
+
+static int
+get_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_conntrack_tuple *orig_tuple,
+		 const struct ip_nat_multi_range *mrr,
+		 struct ip_conntrack *conntrack,
+		 unsigned int hooknum)
+{
+	struct ip_nat_protocol *proto
+		= find_nat_proto(orig_tuple->dst.protonum);
+	struct ip_nat_range *rptr;
+	unsigned int i;
+	int ret;
+
+	/* We temporarily use flags for marking full parts, but we
+	   always clean up afterwards */
+	struct ip_nat_multi_range *mr = (void *)mrr;
+
+	/* 1) If this srcip/proto/src-proto-part is currently mapped,
+	   and that same mapping gives a unique tuple within the given
+	   range, use that.
+
+	   This is only required for source (ie. NAT/masq) mappings.
+	   So far, we don't do local source mappings, so multiple
+	   manips are not an issue.  */
+	if (hooknum == NF_IP_POST_ROUTING) {
+		struct ip_conntrack_manip *manip;
+
+		manip = find_appropriate_src(orig_tuple, mr);
+		if (manip) {
+			/* Apply same source manipulation. */
+			*tuple = ((struct ip_conntrack_tuple)
+				  { *manip, orig_tuple->dst });
+			DEBUGP("get_unique_tuple: Found current src map\n");
+			return 1;
+		}
+	}
+
+	/* 2) Select the least-used IP/proto combination in the given
+	   range.
+	*/
+	*tuple = *orig_tuple;
+	while ((rptr = find_best_ips_proto(tuple, mr, conntrack, hooknum))
+	       != NULL) {
+		DEBUGP("Found best for "); DUMP_TUPLE(tuple);
+		/* 3) The per-protocol part of the manip is made to
+		   map into the range to make a unique tuple. */
+
+		/* Only bother mapping if it's not already in range
+		   and unique */
+		if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+		     || proto->in_range(tuple, HOOK2MANIP(hooknum),
+					&rptr->min, &rptr->max))
+		    && !ip_nat_used_tuple(tuple, conntrack)) {
+			ret = 1;
+			goto clear_fulls;
+		} else {
+			if (proto->unique_tuple(tuple, rptr,
+						HOOK2MANIP(hooknum),
+						conntrack)) {
+				/* Must be unique. */
+				IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
+								conntrack));
+				ret = 1;
+				goto clear_fulls;
+			}
+			DEBUGP("Protocol can't get unique tuple.\n");
+		}
+
+		/* Eliminate that from range, and try again. */
+		rptr->flags |= IP_NAT_RANGE_FULL;
+		*tuple = *orig_tuple;
+	}
+
+	ret = 0;
+
+ clear_fulls:
+	/* Clear full flags. */
+	IP_NF_ASSERT(mr->rangesize >= 1);
+	for (i = 0; i < mr->rangesize; i++)
+		mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
+
+	return ret;
+}
+
+static inline int
+helper_cmp(const struct ip_nat_helper *helper,
+	   u_int16_t protocol,
+	   u_int16_t protocol_dst)
+{
+	return (protocol == helper->protocol
+		&& protocol_dst == helper->protocol_dst);
+}
+
+/* Where to manip the reply packets (will be reverse manip). */
+static unsigned int opposite_hook[NF_IP_NUMHOOKS]
+= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
+    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
+    [NF_IP_LOCAL_OUT] = NF_IP_PRE_ROUTING
+};
+
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack,
+		  const struct ip_nat_multi_range *mr,
+		  unsigned int hooknum)
+{
+	struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
+	struct ip_conntrack_tuple orig_tp;
+	struct ip_nat_info *info = &conntrack->nat.info;
+
+	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_POST_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
+	IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+	/* What we've got will look like inverse of reply. Normally
+	   this is what is in the conntrack, except for prior
+	   manipulations (future optimization: if num_manips == 0,
+	   orig_tp =
+	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+	invert_tuplepr(&orig_tp,
+		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+#if 0
+	{
+	unsigned int i;
+
+	DEBUGP("Hook %u (%s), ", hooknum,
+	       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
+	DUMP_TUPLE(&orig_tp);
+	DEBUGP("Range %p: ", mr);
+	for (i = 0; i < mr->rangesize; i++) {
+		DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
+		       i,
+		       (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
+		       ? " MAP_IPS" : "",
+		       (mr->range[i].flags
+			& IP_NAT_RANGE_PROTO_SPECIFIED)
+		       ? " PROTO_SPECIFIED" : "",
+		       (mr->range[i].flags & IP_NAT_RANGE_FULL)
+		       ? " FULL" : "",
+		       IP_PARTS(mr->range[i].min_ip),
+		       IP_PARTS(mr->range[i].max_ip),
+		       mr->range[i].min.all,
+		       mr->range[i].max.all);
+	}
+	}
+#endif
+
+	do {
+		if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
+				      hooknum)) {
+			DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
+			       conntrack);
+			return NF_DROP;
+		}
+
+#if 0
+		DEBUGP("Hook %u (%s) %p\n", hooknum,
+		       HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
+		       conntrack);
+		DEBUGP("Original: ");
+		DUMP_TUPLE(&orig_tp);
+		DEBUGP("New: ");
+		DUMP_TUPLE(&new_tuple);
+#endif
+
+		/* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
+		   the original (A/B/C/D') and the mangled one (E/F/G/H').
+
+		   We're only allowed to work with the SRC per-proto
+		   part, so we create inverses of both to start, then
+		   derive the other fields we need.  */
+
+		/* Reply connection: simply invert the new tuple
+                   (G/H/E/F') */
+		invert_tuplepr(&reply, &new_tuple);
+
+		/* Alter conntrack table so it recognizes replies.
+                   If fail this race (reply tuple now used), repeat. */
+	} while (!ip_conntrack_alter_reply(conntrack, &reply));
+
+	/* FIXME: We can simply use the existing conntrack reply tuple
+           here --RR */
+	/* Create inverse of original: C/D/A/B' */
+	invert_tuplepr(&inv_tuple, &orig_tp);
+
+	/* Has the source changed? */
+	if (memcmp(&new_tuple.src, &orig_tp.src, sizeof(new_tuple.src))
+	    != 0) {
+		/* In this direction, a source manip. */
+		info->manips[info->num_manips++] =
+			((struct ip_nat_info_manip)
+			 { IP_CT_DIR_ORIGINAL, hooknum,
+			   IP_NAT_MANIP_SRC, new_tuple.src });
+
+		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+		/* In the reverse direction, a destination manip. */
+		info->manips[info->num_manips++] =
+			((struct ip_nat_info_manip)
+			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+			   IP_NAT_MANIP_DST, orig_tp.src });
+		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+	}
+
+	/* Has destination changed? */
+	if (memcmp(&new_tuple.dst, &orig_tp.dst, sizeof(new_tuple.dst))
+	    != 0) {
+		/* In this direction, a destination manip */
+		info->manips[info->num_manips++] =
+			((struct ip_nat_info_manip)
+			 { IP_CT_DIR_ORIGINAL, hooknum,
+			   IP_NAT_MANIP_DST, reply.src });
+
+		IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
+
+		/* In the reverse direction, a source manip. */
+		info->manips[info->num_manips++] =
+			((struct ip_nat_info_manip)
+			 { IP_CT_DIR_REPLY, opposite_hook[hooknum],
+			   IP_NAT_MANIP_SRC, inv_tuple.src });
+		IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
+	}
+
+	/* If there's a helper, assign it based on the new tuple. */
+	info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
+				 new_tuple.dst.protonum,
+				 new_tuple.dst.u.all);
+
+	/* It's done. */
+	info->initialized |= (1 << HOOK2MANIP(hooknum));
+	return NF_ACCEPT;
+}
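
To make the manip bookkeeping above concrete, a worked example in comment form (all addresses and ports invented) for a source mapping set up at NF_IP_POST_ROUTING:

	/*
	 * ORIGINAL tuple 10.0.0.2:1029 -> 198.51.100.7:80 is source-mapped
	 * to 192.0.2.1:1029, so the conntrack's REPLY tuple becomes
	 * 198.51.100.7:80 -> 192.0.2.1:1029 and two manips are recorded:
	 *   manips[0]: IP_CT_DIR_ORIGINAL, NF_IP_POST_ROUTING,
	 *              IP_NAT_MANIP_SRC, 192.0.2.1:1029  (rewrite outgoing source)
	 *   manips[1]: IP_CT_DIR_REPLY,   NF_IP_PRE_ROUTING,
	 *              IP_NAT_MANIP_DST,  10.0.0.2:1029  (restore it on replies)
	 */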
+
+void replace_in_hashes(struct ip_conntrack *conntrack,
+		       struct ip_nat_info *info)
+{
+	/* Source has changed, so replace in hashes. */
+	unsigned int srchash
+		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			      .tuple.src,
+			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			      .tuple.dst.protonum);
+	/* We place the packet as seen OUTGOING in the byipsproto hash
+           (ie. reverse the dst and src of the reply packet). */
+	unsigned int ipsprotohash
+		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.dst.ip,
+				   conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.src.ip,
+				   conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.dst.protonum);
+
+	IP_NF_ASSERT(info->bysource.conntrack == conntrack);
+	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+
+	list_del(&info->bysource.list);
+	list_del(&info->byipsproto.list);
+
+	list_prepend(&bysource[srchash], &info->bysource);
+	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+void place_in_hashes(struct ip_conntrack *conntrack,
+		     struct ip_nat_info *info)
+{
+	unsigned int srchash
+		= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			      .tuple.src,
+			      conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			      .tuple.dst.protonum);
+	/* We place the packet as seen OUTGOING in the byipsproto hash
+           (ie. reverse the dst and src of the reply packet). */
+	unsigned int ipsprotohash
+		= hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.dst.ip,
+				   conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.src.ip,
+				   conntrack->tuplehash[IP_CT_DIR_REPLY]
+				   .tuple.dst.protonum);
+
+	IP_NF_ASSERT(!info->bysource.conntrack);
+
+	MUST_BE_WRITE_LOCKED(&ip_nat_lock);
+	info->byipsproto.conntrack = conntrack;
+	info->bysource.conntrack = conntrack;
+
+	list_prepend(&bysource[srchash], &info->bysource);
+	list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
+}
+
+static void
+manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len,
+	  const struct ip_conntrack_manip *manip,
+	  enum ip_nat_manip_type maniptype)
+{
+	find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
+						iph->check);
+		iph->saddr = manip->ip;
+	} else {
+		iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
+						iph->check);
+		iph->daddr = manip->ip;
+	}
+#if 0
+	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+		DEBUGP("IP: checksum on packet bad.\n");
+
+	if (proto == IPPROTO_TCP) {
+		void *th = (u_int32_t *)iph + iph->ihl;
+		if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr,
+				 csum_partial((char *)th, len-4*iph->ihl, 0)))
+			DEBUGP("TCP: checksum on packet bad\n");
+	}
+#endif
+}
+
+/* Do packet manipulations according to binding. */
+unsigned int
+do_bindings(struct ip_conntrack *ct,
+	    enum ip_conntrack_info ctinfo,
+	    struct ip_nat_info *info,
+	    unsigned int hooknum,
+	    struct sk_buff **pskb)
+{
+	unsigned int i;
+	struct ip_nat_helper *helper;
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+	/* Need the nat lock to protect against modification, but neither
+	   the conntrack (referenced) nor the helper (deleted with
+	   synchronize_bh()) can vanish. */
+	READ_LOCK(&ip_nat_lock);
+	for (i = 0; i < info->num_manips; i++) {
+		if (info->manips[i].direction == dir
+		    && info->manips[i].hooknum == hooknum) {
+			DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
+			       *pskb,
+			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
+			       ? "SRC" : "DST",
+			       IP_PARTS(info->manips[i].manip.ip),
+			       htons(info->manips[i].manip.u.all));
+			manip_pkt((*pskb)->nh.iph->protocol,
+				  (*pskb)->nh.iph,
+				  (*pskb)->len,
+				  &info->manips[i].manip,
+				  info->manips[i].maniptype);
+		}
+	}
+	helper = info->helper;
+	READ_UNLOCK(&ip_nat_lock);
+
+	if (helper) {
+		/* Always defragged for helpers */
+		IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+			       & __constant_htons(IP_MF|IP_OFFSET)));
+		return helper->help(ct, info, ctinfo, hooknum, pskb);
+	} else return NF_ACCEPT;
+}
+
+void
+icmp_reply_translation(struct sk_buff *skb,
+		       struct ip_conntrack *conntrack,
+		       unsigned int hooknum,
+		       int dir)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+	struct iphdr *inner = (struct iphdr *)(hdr + 1);
+	size_t datalen = skb->len - ((void *)inner - (void *)iph);
+	unsigned int i;
+	struct ip_nat_info *info = &conntrack->nat.info;
+
+	IP_NF_ASSERT(skb->len >= iph->ihl*4 + sizeof(struct icmphdr));
+
+	DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
+	       skb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+	/* Note: May not be from a NAT'd host, but probably safest to
+	   do translation always as if it came from the host itself
+	   (even though a "host unreachable" coming from the host
+	   itself is a bit weird).
+
+	   More explanation: some people use NAT for anonymizing.
+	   Also, CERT recommends dropping all packets from private IP
+	   addresses (although ICMP errors from internal links with
+	   such addresses are not too uncommon, as Alan Cox points
+	   out) */
+
+	READ_LOCK(&ip_nat_lock);
+	for (i = 0; i < info->num_manips; i++) {
+		DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
+		       i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
+		       "ORIG" : "REPLY", info->manips[i].hooknum);
+		/* Mapping the inner packet is just like a normal
+		   packet in the other direction, except it was never
+		   src/dst reversed, so where we would normally apply
+		   a dst manip, we apply a src manip, and vice versa. */
+		if (info->manips[i].direction != dir
+		    && info->manips[i].hooknum == opposite_hook[hooknum]) {
+			DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
+			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
+			       ? "DST" : "SRC",
+			       IP_PARTS(info->manips[i].manip.ip),
+			       ntohs(info->manips[i].manip.u.udp.port));
+			manip_pkt(inner->protocol, inner,
+				  skb->len - ((void *)inner - (void *)iph),
+				  &info->manips[i].manip,
+				  !info->manips[i].maniptype);
+		}
+		/* Outer packet needs to have IP header NATed like
+                   it's a reply. */
+		else if (info->manips[i].direction != dir
+			 && info->manips[i].hooknum == hooknum) {
+			/* Use mapping to map outer packet: 0 gives no
+                           per-proto mapping */
+			DEBUGP("icmp_reply: outer %s %u.%u.%u.%u\n",
+			       info->manips[i].maniptype == IP_NAT_MANIP_SRC
+			       ? "SRC" : "DST",
+			       IP_PARTS(info->manips[i].manip.ip));
+			manip_pkt(0, iph, skb->len,
+				  &info->manips[i].manip,
+				  info->manips[i].maniptype);
+		}
+	}
+	READ_UNLOCK(&ip_nat_lock);
+
+	/* Since we mangled inside ICMP packet, recalculate its
+	   checksum from scratch.  (Hence the handling of incorrect
+	   checksums in conntrack, so we don't accidentally fix one.)  */
+	hdr->checksum = 0;
+	hdr->checksum = ip_compute_csum((unsigned char *)hdr,
+					sizeof(*hdr) + datalen);
+}
+
+int ip_nat_helper_register(struct ip_nat_helper *me)
+{
+	int ret = 0;
+
+	WRITE_LOCK(&ip_nat_lock);
+	if (LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
+		      me->protocol, me->protocol_dst))
+		ret = -EBUSY;
+	else {
+		list_prepend(&helpers, me);
+		MOD_INC_USE_COUNT;
+	}
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	return ret;
+}
+
+static int
+kill_helper(const struct ip_conntrack *i, void *helper)
+{
+	int ret;
+
+	READ_LOCK(&ip_nat_lock);
+	ret = (i->nat.info.helper == helper);
+	READ_UNLOCK(&ip_nat_lock);
+
+	return ret;
+}
+
+void ip_nat_helper_unregister(struct ip_nat_helper *me)
+{
+	WRITE_LOCK(&ip_nat_lock);
+	LIST_DELETE(&helpers, me);
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	/* Someone could be still looking at the helper in a bh. */
+	br_write_lock_bh(BR_NETPROTO_LOCK);
+	br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+	/* Find anything using it, and umm, kill them.  We can't turn
+	   them into normal connections: if we've adjusted SYNs, then
+	   they'll ackstorm.  So we just drop it.  We used to just
+	   bump module count when a connection existed, but that
+	   forces admins to gen fake RSTs or bounce the box, either of
+	   which is just a long-winded way of making things
+	   worse. --RR */
+	ip_ct_selective_cleanup(kill_helper, me);
+
+	MOD_DEC_USE_COUNT;
+}
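
As a hedged sketch of how a helper module might use the registration interface above: ip_nat_helper_register(), ip_nat_helper_unregister(), NF_ACCEPT, and the protocol, protocol_dst and help members (used by helper_cmp() and do_bindings()) are taken from this file, and the help() signature is inferred from the call in do_bindings(); every other name below is a placeholder, not a real helper implementation.

	/* Sketch only: a hypothetical helper bound to TCP destination port 21. */
	static unsigned int example_help(struct ip_conntrack *ct,
					 struct ip_nat_info *info,
					 enum ip_conntrack_info ctinfo,
					 unsigned int hooknum,
					 struct sk_buff **pskb)
	{
		/* A real helper would rewrite addresses or ports carried in
		 * the packet payload here. */
		return NF_ACCEPT;
	}

	static struct ip_nat_helper example_helper;

	int example_init(void)
	{
		example_helper.protocol = IPPROTO_TCP;
		example_helper.protocol_dst = htons(21);  /* compared with tuple.dst.u.all */
		example_helper.help = example_help;
		return ip_nat_helper_register(&example_helper);
	}

	void example_fini(void)
	{
		ip_nat_helper_unregister(&example_helper);
	}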
+
+int __init ip_nat_init(void)
+{
+	size_t i;
+
+	/* Sew in builtin protocols. */
+	WRITE_LOCK(&ip_nat_lock);
+	list_append(&protos, &ip_nat_protocol_tcp);
+	list_append(&protos, &ip_nat_protocol_udp);
+	list_append(&protos, &ip_nat_protocol_icmp);
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	for (i = 0; i < IP_NAT_HTABLE_SIZE; i++) {
+		INIT_LIST_HEAD(&bysource[i]);
+		INIT_LIST_HEAD(&byipsproto[i]);
+	}
+
+	/* FIXME: Man, this is a hack.  <SIGH> */
+	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+	return 0;
+}
+
+void ip_nat_cleanup(void)
+{
+	ip_conntrack_destroyed = NULL;
+}
