patch-2.1.15 linux/net/ipv4/route.c

Next file: linux/net/ipv4/sysctl_net_ipv4.c
Previous file: linux/net/ipv4/raw.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.14/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -11,6 +11,7 @@
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  *
  * Fixes:
  *		Alan Cox	:	Verify area fixes.
@@ -42,6 +43,8 @@
  *		Bjorn Ekwall	:	Kerneld route support.
  *		Alan Cox	:	Multicast fixed (I hope)
  * 		Pavel Krauz	:	Limited broadcast fixed
+ *	Alexey Kuznetsov	:	End of old history. Split into fib.c and
+ *					route.c and rewritten from scratch.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -65,761 +68,60 @@
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
+#include <linux/proc_fs.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/route.h>
+#include <net/arp.h>
 #include <net/tcp.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <net/icmp.h>
-#include <net/netlink.h>
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
-#endif
+#include <linux/net_alias.h>
+/* Empties the whole route cache; defined further down. */
+static void rt_run_flush(unsigned long);
+/* NOTE(review): the RT_FLUSH_DELAY initializer looks like a non-zero .expires, which rt_cache_flush() reads as "flush pending" - confirm. */
+static struct timer_list rt_flush_timer =	/* armed by rt_cache_flush() to defer a flush */
+	{ NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush };
 
 /*
- * Forwarding Information Base definitions.
+ *	Interface to generic destination cache.
  */
 
-struct fib_node
-{
-	struct fib_node		*fib_next;
-	__u32			fib_dst;
-	unsigned long		fib_use;
-	struct fib_info		*fib_info;
-	short			fib_metric;
-	unsigned char		fib_tos;
-};
+static void ipv4_dst_destroy(struct dst_entry * dst);
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst);
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst);
 
-/*
- * This structure contains data shared by many of routes.
- */	
 
-struct fib_info
+struct dst_ops ipv4_dst_ops =
 {
-	struct fib_info		*fib_next;
-	struct fib_info		*fib_prev;
-	__u32			fib_gateway;
-	struct device		*fib_dev;
-	int			fib_refcnt;
-	unsigned long		fib_window;
-	unsigned short		fib_flags;
-	unsigned short		fib_mtu;
-	unsigned short		fib_irtt;
-};
-
-struct fib_zone
-{
-	struct fib_zone	*fz_next;
-	struct fib_node	**fz_hash_table;
-	struct fib_node	*fz_list;
-	int		fz_nent;
-	int		fz_logmask;
-	__u32		fz_mask;
-};
-
-static struct fib_zone 	*fib_zones[33];
-static struct fib_zone 	*fib_zone_list;
-static struct fib_node 	*fib_loopback = NULL;
-static struct fib_info 	*fib_info_list;
-
-/*
- * Backlogging.
- */
-
-#define RT_BH_REDIRECT		0
-#define RT_BH_GARBAGE_COLLECT 	1
-#define RT_BH_FREE	 	2
-
-struct rt_req
-{
-	struct rt_req * rtr_next;
-	struct device *dev;
-	__u32 dst;
-	__u32 gw;
-	unsigned char tos;
+	AF_INET,		/* positional initializer: slot order must match struct dst_ops */
+	ipv4_dst_check,
+	ipv4_dst_reroute,
+	ipv4_dst_destroy
 };
 
-int		    	ip_rt_lock;
-unsigned		ip_rt_bh_mask;
-static struct rt_req 	*rt_backlog;
 
 /*
  * Route cache.
  */
 
-struct rtable 		*ip_rt_hash_table[RT_HASH_DIVISOR];
-static int		rt_cache_size;
-static struct rtable 	*rt_free_queue;
-struct wait_queue	*rt_wait;
-
-static void rt_kick_backlog(void);
-static void rt_cache_add(unsigned hash, struct rtable * rth);
-static void rt_cache_flush(void);
-static void rt_garbage_collect_1(void);
-
-/* 
- * Evaluate mask length.
- */
-
-static __inline__ int rt_logmask(__u32 mask)
-{
-	if (!(mask = ntohl(mask)))
-		return 32;
-	return ffz(~mask);
-}
-
-/* 
- * Create mask from length.
- */
-
-static __inline__ __u32 rt_mask(int logmask)
-{
-	if (logmask >= 32)
-		return 0;
-	return htonl(~((1<<logmask)-1));
-}
-
-static __inline__ unsigned fz_hash_code(__u32 dst, int logmask)
-{
-	return ip_rt_hash_code(ntohl(dst)>>logmask);
-}
-
-/*
- * Free FIB node.
- */
-
-static void fib_free_node(struct fib_node * f)
-{
-	struct fib_info * fi = f->fib_info;
-	if (!--fi->fib_refcnt)
-	{
-#if RT_CACHE_DEBUG >= 2
-		printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
-		if (fi->fib_next)
-			fi->fib_next->fib_prev = fi->fib_prev;
-		if (fi->fib_prev)
-			fi->fib_prev->fib_next = fi->fib_next;
-		if (fi == fib_info_list)
-			fib_info_list = fi->fib_next;
-	}
-	kfree_s(f, sizeof(struct fib_node));
-}
-
-/*
- * Find gateway route by address.
- */
-
-static struct fib_node * fib_lookup_gateway(__u32 dst)
-{
-	struct fib_zone * fz;
-	struct fib_node * f;
-
-	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
-	{
-		if (fz->fz_hash_table)
-			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
-		else
-			f = fz->fz_list;
-		
-		for ( ; f; f = f->fib_next)
-		{
-			if ((dst ^ f->fib_dst) & fz->fz_mask)
-				continue;
-			if (f->fib_info->fib_flags & RTF_GATEWAY)
-				return NULL;
-			return f;
-		}
-	}
-	return NULL;
-}
-
-/*
- * Find local route by address.
- * FIXME: I use "longest match" principle. If destination
- *	  has some non-local route, I'll not search shorter matches.
- *	  It's possible, I'm wrong, but I wanted to prevent following
- *	  situation:
- *	route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx
- *	route add 193.233.7.0	netmask 255.255.255.0 eth1
- *	  (Two ethernets connected by serial line, one is small and other is large)
- *	  Host 193.233.7.129 is locally unreachable,
- *	  but old (<=1.3.37) code will send packets destined for it to eth1.
- *
- */
-
-static struct fib_node * fib_lookup_local(__u32 dst)
-{
-	struct fib_zone * fz;
-	struct fib_node * f;
-
-	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
-	{
-		int longest_match_found = 0;
-
-		if (fz->fz_hash_table)
-			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
-		else
-			f = fz->fz_list;
-		
-		for ( ; f; f = f->fib_next)
-		{
-			if ((dst ^ f->fib_dst) & fz->fz_mask)
-				continue;
-			if (!(f->fib_info->fib_flags & RTF_GATEWAY))
-				return f;
-			longest_match_found = 1;
-		}
-		if (longest_match_found)
-			return NULL;
-	}
-	return NULL;
-}
-
-/*
- * Main lookup routine.
- *	IMPORTANT NOTE: this algorithm has small difference from <=1.3.37 visible
- *	by user. It doesn't route non-CIDR broadcasts by default.
- *
- *	F.e.
- *		ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255
- *	is valid, but if you really are not able (not allowed, do not want) to
- *	use CIDR compliant broadcast 193.233.7.127, you should add host route:
- *		route add -host 193.233.7.255 eth0
- */
-
-static struct fib_node * fib_lookup(__u32 dst)
-{
-	struct fib_zone * fz;
-	struct fib_node * f;
-
-	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
-	{
-		if (fz->fz_hash_table)
-			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
-		else
-			f = fz->fz_list;
-		
-		for ( ; f; f = f->fib_next)
-		{
-			if ((dst ^ f->fib_dst) & fz->fz_mask)
-				continue;
-			return f;
-		}
-	}
-	return NULL;
-}
-
-static __inline__ struct device * get_gw_dev(__u32 gw)
-{
-	struct fib_node * f;
-	f = fib_lookup_gateway(gw);
-	if (f)
-		return f->fib_info->fib_dev;
-	return NULL;
-}
-
-/*
- *	Check if a mask is acceptable.
- */
- 
-static inline int bad_mask(__u32 mask, __u32 addr)
-{
-	if (addr & (mask = ~mask))
-		return 1;
-	mask = ntohl(mask);
-	if (mask & (mask+1))
-		return 1;
-	return 0;
-}
-
-
-static int fib_del_list(struct fib_node **fp, __u32 dst,
-		struct device * dev, __u32 gtw, short flags, short metric, __u32 mask)
-{
-	struct fib_node *f;
-	int found=0;
-
-	while((f = *fp) != NULL) 
-	{
-		struct fib_info * fi = f->fib_info;
-
-		/*
-		 *	Make sure the destination and netmask match.
-		 *	metric, gateway and device are also checked
-		 *	if they were specified.
-		 */
-		if (f->fib_dst != dst ||
-		    (gtw && fi->fib_gateway != gtw) ||
-		    (metric >= 0 && f->fib_metric != metric) ||
-		    (dev && fi->fib_dev != dev) )
-		{
-			fp = &f->fib_next;
-			continue;
-		}
-		cli();
-		*fp = f->fib_next;
-		if (fib_loopback == f)
-			fib_loopback = NULL;
-		sti();
-		ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name);
-		fib_free_node(f);
-		found++;
-	}
-	return found;
-}
-
-static __inline__ int fib_del_1(__u32 dst, __u32 mask,
-		struct device * dev, __u32 gtw, short flags, short metric)
-{
-	struct fib_node **fp;
-	struct fib_zone *fz;
-	int found=0;
-
-	if (!mask)
-	{
-		for (fz=fib_zone_list; fz; fz = fz->fz_next)
-		{
-			int tmp;
-			if (fz->fz_hash_table)
-				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
-			else
-				fp = &fz->fz_list;
-
-			tmp = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
-			fz->fz_nent -= tmp;
-			found += tmp;
-		}
-	} 
-	else
-	{
-		if ((fz = fib_zones[rt_logmask(mask)]) != NULL)
-		{
-			if (fz->fz_hash_table)
-				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
-			else
-				fp = &fz->fz_list;
-	
-			found = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
-			fz->fz_nent -= found;
-		}
-	}
-
-	if (found)
-	{
-		rt_cache_flush();
-		return 0;
-	}
-	return -ESRCH;
-}
-
-
-static struct fib_info * fib_create_info(__u32 gw, struct device * dev,
-					 unsigned short flags, unsigned short mss,
-					 unsigned long window, unsigned short irtt)
-{
-	struct fib_info * fi;
-
-	if (!(flags & RTF_MSS))
-	{
-		mss = dev->mtu;
-#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
-		/*
-		 *	If MTU was not specified, use default.
-		 *	If you want to increase MTU for some net (local subnet)
-		 *	use "route add .... mss xxx".
-		 *
-		 * 	The MTU isn't currently always used and computed as it
-		 *	should be as far as I can tell. [Still verifying this is right]
-		 */
-		if ((flags & RTF_GATEWAY) && mss > 576)
-			mss = 576;
-#endif
-	}
-	if (!(flags & RTF_WINDOW))
-		window = 0;
-	if (!(flags & RTF_IRTT))
-		irtt = 0;
-
-	for (fi=fib_info_list; fi; fi = fi->fib_next)
-	{
-		if (fi->fib_gateway != gw ||
-		    fi->fib_dev != dev  ||
-		    fi->fib_flags != flags ||
-		    fi->fib_mtu != mss ||
-		    fi->fib_window != window ||
-		    fi->fib_irtt != irtt)
-			continue;
-		fi->fib_refcnt++;
-#if RT_CACHE_DEBUG >= 2
-		printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
-		return fi;
-	}
-	fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL);
-	if (!fi)
-		return NULL;
-	memset(fi, 0, sizeof(struct fib_info));
-	fi->fib_flags = flags;
-	fi->fib_dev = dev;
-	fi->fib_gateway = gw;
-	fi->fib_mtu = mss;
-	fi->fib_window = window;
-	fi->fib_refcnt++;
-	fi->fib_next = fib_info_list;
-	fi->fib_prev = NULL;
-	fi->fib_irtt = irtt;
-	if (fib_info_list)
-		fib_info_list->fib_prev = fi;
-	fib_info_list = fi;
-#if RT_CACHE_DEBUG >= 2
-	printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
-	return fi;
-}
-
-
-static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask,
-	__u32 gw, struct device *dev, unsigned short mss,
-	unsigned long window, unsigned short irtt, short metric)
-{
-	struct fib_node *f, *f1;
-	struct fib_node **fp;
-	struct fib_node **dup_fp = NULL;
-	struct fib_zone * fz;
-	struct fib_info * fi;
-	int logmask;
-
-	/*
-	 *	Allocate an entry and fill it in.
-	 */
-	 
-	f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
-	if (f == NULL)
-		return;
-
-	memset(f, 0, sizeof(struct fib_node));
-	f->fib_dst = dst;
-	f->fib_metric = metric;
-	f->fib_tos    = 0;
-
-	if  ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL)
-	{
-		kfree_s(f, sizeof(struct fib_node));
-		return;
-	}
-	f->fib_info = fi;
-
-	logmask = rt_logmask(mask);
-	fz = fib_zones[logmask];
-
-
-	if (!fz)
-	{
-		int i;
-		fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL);
-		if (!fz)
-		{
-			fib_free_node(f);
-			return;
-		}
-		memset(fz, 0, sizeof(struct fib_zone));
-		fz->fz_logmask = logmask;
-		fz->fz_mask = mask;
-		for (i=logmask-1; i>=0; i--)
-			if (fib_zones[i])
-				break;
-		cli();
-		if (i<0)
-		{
-			fz->fz_next = fib_zone_list;
-			fib_zone_list = fz;
-		}
-		else
-		{
-			fz->fz_next = fib_zones[i]->fz_next;
-			fib_zones[i]->fz_next = fz;
-		}
-		fib_zones[logmask] = fz;
-		sti();
-	}
-
-	/*
-	 * If zone overgrows RTZ_HASHING_LIMIT, create hash table.
-	 */
-
-	if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32)
-	{
-		struct fib_node ** ht;
-#if RT_CACHE_DEBUG >= 2
-		printk("fib_add_1: hashing for zone %d started\n", logmask);
-#endif
-		ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct rtable*), GFP_KERNEL);
-
-		if (ht)
-		{
-			memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*));
-			cli();
-			f1 = fz->fz_list;
-			while (f1)
-			{
-				struct fib_node * next;
-				unsigned hash = fz_hash_code(f1->fib_dst, logmask);
-				next = f1->fib_next;
-				f1->fib_next = ht[hash];
-				ht[hash] = f1;
-				f1 = next;
-			}
-			fz->fz_list = NULL;
-			fz->fz_hash_table = ht; 
-			sti();
-		}
-	}
-
-	if (fz->fz_hash_table)
-		fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)];
-	else
-		fp = &fz->fz_list;
-
-	/*
-	 * Scan list to find the first route with the same destination
-	 */
-	while ((f1 = *fp) != NULL)
-	{
-		if (f1->fib_dst == dst)
-			break;
-		fp = &f1->fib_next;
-	}
-
-	/*
-	 * Find route with the same destination and less (or equal) metric.
-	 */
-	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
-	{
-		if (f1->fib_metric >= metric)
-			break;
-		/*
-		 *	Record route with the same destination and gateway,
-		 *	but less metric. We'll delete it 
-		 *	after instantiation of new route.
-		 */
-		if (f1->fib_info->fib_gateway == gw &&
-		    (gw || f1->fib_info->fib_dev == dev))
-			dup_fp = fp;
-		fp = &f1->fib_next;
-	}
-
-	/*
-	 * Is it already present?
-	 */
-
-	if (f1 && f1->fib_metric == metric && f1->fib_info == fi)
-	{
-		fib_free_node(f);
-		return;
-	}
-	
-	/*
-	 * Insert new entry to the list.
-	 */
-
-	cli();
-	f->fib_next = f1;
-	*fp = f;
-	if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK))
-		fib_loopback = f;
-	sti();
-	fz->fz_nent++;
-	ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name);
-
-	/*
-	 *	Delete route with the same destination and gateway.
-	 *	Note that we should have at most one such route.
-	 */
-	if (dup_fp)
-		fp = dup_fp;
-	else
-		fp = &f->fib_next;
-
-	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
-	{
-		if (f1->fib_info->fib_gateway == gw &&
-		    (gw || f1->fib_info->fib_dev == dev))
-		{
-			cli();
-			*fp = f1->fib_next;
-			if (fib_loopback == f1)
-				fib_loopback = NULL;
-			sti();
-			ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name);
-			fib_free_node(f1);
-			fz->fz_nent--;
-			break;
-		}
-		fp = &f1->fib_next;
-	}
-	rt_cache_flush();
-	return;
-}
-
-static int rt_flush_list(struct fib_node ** fp, struct device *dev)
-{
-	int found = 0;
-	struct fib_node *f;
+static atomic_t		 rt_cache_size;	/* decremented each time an entry is unlinked from rt_hash_table */
+static struct rtable 	*rt_hash_table[RT_HASH_DIVISOR];	/* bucket heads; entries chain through u.rt_next */
 
-	while ((f = *fp) != NULL) {
-/*
- *	"Magic" device route is allowed to point to loopback,
- *	discard it too.
- */
-		if (f->fib_info->fib_dev != dev &&
-		    (f->fib_info->fib_dev != &loopback_dev || f->fib_dst != dev->pa_addr)) {
-			fp = &f->fib_next;
-			continue;
-		}
-		cli();
-		*fp = f->fib_next;
-		if (fib_loopback == f)
-			fib_loopback = NULL;
-		sti();
-		fib_free_node(f);
-		found++;
-	}
-	return found;
-}
+static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol);
 
-static __inline__ void fib_flush_1(struct device *dev)
+static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
 {
-	struct fib_zone *fz;
-	int found = 0;
-
-	for (fz = fib_zone_list; fz; fz = fz->fz_next)
-	{
-		if (fz->fz_hash_table)
-		{
-			int i;
-			int tmp = 0;
-			for (i=0; i<RTZ_HASH_DIVISOR; i++)
-				tmp += rt_flush_list(&fz->fz_hash_table[i], dev);
-			fz->fz_nent -= tmp;
-			found += tmp;
-		}
-		else
-		{
-			int tmp;
-			tmp = rt_flush_list(&fz->fz_list, dev);
-			fz->fz_nent -= tmp;
-			found += tmp;
-		}
-	}
-		
-	if (found)
-		rt_cache_flush();
+	unsigned h = ((daddr >> 4) & 0x0F0F0F0F) | ((daddr << 4) & 0xF0F0F0F0);	/* swap the nibbles of each byte */
+	h ^= saddr ^ tos;
+	h ^= h >> 16;			/* fold the upper half-word down */
+	return (h ^ (h >> 8)) & 0xFF;	/* fold again and keep one byte */
 }
 
+#ifdef CONFIG_PROC_FS
 
-/* 
- *	Called from the PROCfs module. This outputs /proc/net/route.
- *
- *	We preserve the old format but pad the buffers out. This means that
- *	we can spin over the other entries as we read them. Remember the
- *	gated BGP4 code could need to read 60,000+ routes on occasion (that's
- *	about 7Mb of data). To do that ok we will need to also cache the
- *	last route we got to (reads will generally be following on from
- *	one another without gaps).
- */
- 
-int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
-{
-	struct fib_zone *fz;
-	struct fib_node *f;
-	int len=0;
-	off_t pos=0;
-	char temp[129];
-	int i;
-	
-	pos = 128;
-
-	if (offset<128)
-	{
-		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
-		len = 128;
-  	}
-  	
-	while  (ip_rt_lock)
-		sleep_on(&rt_wait);
-	ip_rt_fast_lock();
-
-	for (fz=fib_zone_list; fz; fz = fz->fz_next)
-	{
-		int maxslot;
-		struct fib_node ** fp;
-
-		if (fz->fz_nent == 0)
-			continue;
-
-		if (pos + 128*fz->fz_nent <= offset)
-		{
-			pos += 128*fz->fz_nent;
-			len = 0;
-			continue;
-		}
-
-		if (fz->fz_hash_table)
-		{
-			maxslot = RTZ_HASH_DIVISOR;
-			fp	= fz->fz_hash_table;
-		}
-		else
-		{
-			maxslot	= 1;
-			fp	= &fz->fz_list;
-		}
-			
-		for (i=0; i < maxslot; i++, fp++)
-		{
-			
-			for (f = *fp; f; f = f->fib_next) 
-			{
-				struct fib_info * fi;
-				/*
-				 *	Spin through entries until we are ready
-				 */
-				pos += 128;
-
-				if (pos <= offset)
-				{
-					len=0;
-					continue;
-				}
-					
-				fi = f->fib_info;
-				sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
-					fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway,
-					fi->fib_flags, 0, f->fib_use, f->fib_metric,
-					(unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt);
-				sprintf(buffer+len,"%-127s\n",temp);
-
-				len += 128;
-				if (pos >= offset+length)
-					goto done;
-			}
-		}
-        }
-
-done:
-	ip_rt_unlock();
-	wake_up(&rt_wait);
-  	
-  	*start = buffer+len-(pos-offset);
-  	len = pos - offset;
-  	if (len>length)
-  		len = length;
-  	return len;
-}
-
-int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
 {
 	int len=0;
 	off_t pos=0;
@@ -829,36 +131,39 @@
 
 	pos = 128;
 
-	if (offset<128)
-	{
-		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP");
+	if (offset<128)	{
+		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHH\tARP");
 		len = 128;
   	}
 	
   	
-	while  (ip_rt_lock)
-		sleep_on(&rt_wait);
-	ip_rt_fast_lock();
-
-	for (i = 0; i<RT_HASH_DIVISOR; i++)
-	{
-		for (r = ip_rt_hash_table[i]; r; r = r->rt_next) 
-		{
+	start_bh_atomic();
+
+	for (i = 0; i<RT_HASH_DIVISOR; i++) {
+		for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
 			/*
 			 *	Spin through entries until we are ready
 			 */
 			pos += 128;
 
-			if (pos <= offset)
-			{
+			if (pos <= offset) {
 				len = 0;
 				continue;
 			}
 					
-			sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%d\t%1d",
-				r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
-				r->rt_flags, r->rt_refcnt, r->rt_use, 0,
-				(unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt, r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0);
+			sprintf(temp, "%s\t%08lX\t%08lX\t%X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02x\t%d\t%1d\t%08x\t%02x",
+				r->u.dst.dev ? r->u.dst.dev->name : "*",
+				(unsigned long)r->rt_dst,
+				(unsigned long)r->rt_gateway,
+				r->rt_flags, r->u.dst.refcnt,
+				r->u.dst.use, 0,
+				(unsigned long)r->rt_src, (int)r->u.dst.pmtu,
+				r->u.dst.window,
+				(int)r->u.dst.rtt, r->key.tos,
+				r->u.dst.hh ? r->u.dst.hh->hh_refcnt : -1,
+				r->u.dst.hh ? r->u.dst.hh->hh_uptodate : 0,
+				r->rt_spec_dst,
+				i);
 			sprintf(buffer+len,"%-127s\n",temp);
 			len += 128;
 			if (pos >= offset+length)
@@ -867,8 +172,7 @@
         }
 
 done:
-	ip_rt_unlock();
-	wake_up(&rt_wait);
+	end_bh_atomic();
   	
   	*start = buffer+len-(pos-offset);
   	len = pos-offset;
@@ -876,218 +180,123 @@
   		len = length;
   	return len;
 }
-
-
-static void rt_free(struct rtable * rt)
-{
-	unsigned long flags;
-
-	save_flags(flags);
-	cli();
-	if (!rt->rt_refcnt)
-	{
-		struct hh_cache * hh = rt->rt_hh;
-		rt->rt_hh = NULL;
-		restore_flags(flags);
-		if (hh && atomic_dec_and_test(&hh->hh_refcnt))
-			kfree_s(hh, sizeof(struct hh_cache));
-		kfree_s(rt, sizeof(struct rt_table));
-		return;
-	}
-	rt->rt_next = rt_free_queue;
-	rt->rt_flags &= ~RTF_UP;
-	rt_free_queue = rt;
-	ip_rt_bh_mask |= RT_BH_FREE;
-#if RT_CACHE_DEBUG >= 2
-	printk("rt_free: %08x\n", rt->rt_dst);
 #endif
-	restore_flags(flags);
+/* Drop a cache entry by handing its embedded dst_entry to dst_free(). */
+static void __inline__ rt_free(struct rtable *rt)
+{
+	dst_free(&rt->u.dst);
 }
 
-/*
- * RT "bottom half" handlers. Called with masked interrupts.
- */
 
-static __inline__ void rt_kick_free_queue(void)
+void ip_rt_check_expire()
 {
-	struct rtable *rt, **rtp;
-
-	rtp = &rt_free_queue;
-
-	while ((rt = *rtp) != NULL)
-	{
-		if  (!rt->rt_refcnt)
-		{
-			struct hh_cache * hh = rt->rt_hh;
-#if RT_CACHE_DEBUG >= 2
-			__u32 daddr = rt->rt_dst;
-#endif
-			*rtp = rt->rt_next;
-			rt->rt_hh = NULL;
-			sti();
-			if (hh && atomic_dec_and_test(&hh->hh_refcnt))
-				kfree_s(hh, sizeof(struct hh_cache));
-			kfree_s(rt, sizeof(struct rt_table));
-#if RT_CACHE_DEBUG >= 2
-			printk("rt_kick_free_queue: %08x is free\n", daddr);
-#endif
-			cli();
-			continue;
-		}
-		rtp = &rt->rt_next;
-	}
-}
+	int i;
+	static int rover;
+	struct rtable *rth, **rthp;
+	unsigned long now = jiffies;
 
-void ip_rt_run_bh()
-{
-	unsigned long flags;
-	save_flags(flags);
-	cli();
-	if (ip_rt_bh_mask && !ip_rt_lock)
-	{
-		if (ip_rt_bh_mask & RT_BH_REDIRECT)
-			rt_kick_backlog();
+	start_bh_atomic();
 
-		if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT)
-		{
-			ip_rt_fast_lock();
-			ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT;
-			sti();
-			rt_garbage_collect_1();
-			cli();
-			ip_rt_fast_unlock();
-		}
+	for (i=0; i<RT_HASH_DIVISOR/5; i++) {	/* scan a fifth of the buckets per call */
+		rover = (rover + 1) & (RT_HASH_DIVISOR-1);	/* static cursor: resumes after the previous call's last bucket; mask assumes a power-of-two size */
+		rthp = &rt_hash_table[rover];
 
-		if (ip_rt_bh_mask & RT_BH_FREE)
-			rt_kick_free_queue();
-	}
-	restore_flags(flags);
-}
+		while ((rth = *rthp) != NULL) {
+			struct rtable * rth_next = rth->u.rt_next;
 
+			/*
+			 * Cleanup aged off entries.
+			 */
 
-void ip_rt_check_expire()
-{
-	ip_rt_fast_lock();
-	if (ip_rt_lock == 1)
-	{
-		int i;
-		struct rtable *rth, **rthp;
-		unsigned long flags;
-		unsigned long now = jiffies;
-
-		save_flags(flags);
-		for (i=0; i<RT_HASH_DIVISOR; i++)
-		{
-			rthp = &ip_rt_hash_table[i];
-
-			while ((rth = *rthp) != NULL)
-			{
-				struct rtable * rth_next = rth->rt_next;
-
-				/*
-				 * Cleanup aged off entries.
-				 */
-
-				cli();
-				if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
-				{
-					*rthp = rth_next;
-					sti();
-					rt_cache_size--;
+			if (!rth->u.dst.refcnt && now - rth->u.dst.lastuse > RT_CACHE_TIMEOUT) {
+				*rthp = rth_next;
+				atomic_dec(&rt_cache_size);
 #if RT_CACHE_DEBUG >= 2
-					printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst);
+				printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst);
 #endif
-					rt_free(rth);
-					continue;
-				}
-				sti();
+				rt_free(rth);
+				continue;
+			}
 
-				if (!rth_next)
-					break;
+			if (!rth_next)
+				break;
+
+			/*
+			 * Pseudo-LRU ordering.
+			 * Ideally this would also move rarely used
+			 * but permanently living entries
+			 * (e.g. rdisc, igmp) to the end of the list.
+			 */
 
-				/*
-				 * LRU ordering.
-				 */
-
-				if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOLD < rth_next->rt_lastuse ||
-				    (rth->rt_lastuse < rth_next->rt_lastuse &&
-				     rth->rt_use < rth_next->rt_use))
-				{
+			if ((long)(rth_next->u.dst.lastuse - rth->u.dst.lastuse) > (long)RT_CACHE_BUBBLE_THRESHOLD ||	/* signed deltas: an unsigned difference is never < 0 */
+			    ((long)(rth->u.dst.lastuse - rth_next->u.dst.lastuse) < 0 &&
+			     rth->u.dst.use < rth_next->u.dst.use)) {
 #if RT_CACHE_DEBUG >= 2
-					printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst);
+				printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst);
 #endif
-					cli();
-					*rthp = rth_next;
-					rth->rt_next = rth_next->rt_next;
-					rth_next->rt_next = rth;
-					sti();
-					rthp = &rth_next->rt_next;
-					continue;
-				}
-				rthp = &rth->rt_next;
+				/* Swap rth with its successor and keep scanning from rth's new slot. */
+				*rthp = rth_next;
+				rth->u.rt_next = rth_next->u.rt_next;
+				rth_next->u.rt_next = rth;
+				rthp = &rth_next->u.rt_next;
+				continue;
 			}
+			rthp = &rth->u.rt_next;
 		}
-		restore_flags(flags);
-		rt_kick_free_queue();
 	}
-	ip_rt_unlock();
-}
 
-static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev)
+	end_bh_atomic();
+}
+/* Flush the route cache at once if how != 0 or the pending timer's state forces it;
+ * otherwise arm rt_flush_timer RT_FLUSH_DELAY jiffies ahead, unless already pending. */
+void rt_cache_flush(int how)
 {
-	struct rtable *rt;
-	unsigned long hash = ip_rt_hash_code(dst);
-
-	if (gw == dev->pa_addr)
-		return;
-	if (dev != get_gw_dev(gw))
+	start_bh_atomic();
+	/* Signed delta: "overdue" must not be true for every non-zero difference. */
+	if (rt_flush_timer.expires &&
+	    ((long)(jiffies - rt_flush_timer.expires) > 0 ||
+	     rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2))
+		how = 1;	/* pending timer is overdue or due too far ahead: flush now */
+	if (how) {
+		if (rt_flush_timer.expires)
+			del_timer(&rt_flush_timer);
+		rt_flush_timer.expires = 0;	/* zero is tested as "no flush pending" */
+		end_bh_atomic();
+		rt_run_flush(0);
 		return;
-	rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
-	if (rt == NULL) 
+	}
+	if (rt_flush_timer.expires) {
+		end_bh_atomic();
 		return;
-	memset(rt, 0, sizeof(struct rtable));
-	rt->rt_flags = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP;
-	rt->rt_dst = dst;
-	rt->rt_dev = dev;
-	rt->rt_gateway = gw;
-	rt->rt_src = dev->pa_addr;
-	rt->rt_mtu = dev->mtu;
-#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
-	if (dev->mtu > 576)
-		rt->rt_mtu = 576;
-#endif
-	rt->rt_lastuse  = jiffies;
-	rt->rt_refcnt  = 1;
-	rt_cache_add(hash, rt);
-	ip_rt_put(rt);
-	return;
+	}
+	del_timer(&rt_flush_timer);
+	rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY;
+	add_timer(&rt_flush_timer);
+	end_bh_atomic();
 }
-
-static void rt_cache_flush(void)
+  
+void rt_run_flush(unsigned long dummy)
 {
 	int i;
 	struct rtable * rth, * next;
 
-	for (i=0; i<RT_HASH_DIVISOR; i++)
-	{
+	for (i=0; i<RT_HASH_DIVISOR; i++) {
 		int nr=0;
 
 		cli();
-		if (!(rth = ip_rt_hash_table[i]))
-		{
+		if (!(rth = rt_hash_table[i])) {
 			sti();
 			continue;
 		}
 
-		ip_rt_hash_table[i] = NULL;
+		rt_hash_table[i] = NULL;
 		sti();
 
-		for (; rth; rth=next)
-		{
-			next = rth->rt_next;
-			rt_cache_size--;
+		for (; rth; rth=next) {
+			next = rth->u.rt_next;
+			atomic_dec(&rt_cache_size);
 			nr++;
-			rth->rt_next = NULL;
+			rth->u.rt_next = NULL;
 			rt_free(rth);
 		}
 #if RT_CACHE_DEBUG >= 2
@@ -1095,631 +304,1090 @@
 			printk("rt_cache_flush: %d@%02x\n", nr, i);
 #endif
 	}
-#if RT_CACHE_DEBUG >= 1
-	if (rt_cache_size)
-	{
-		printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size);
-		rt_cache_size = 0;
+}
+
+static void rt_garbage_collect(void)
+{
+	int i;
+	static unsigned expire = RT_CACHE_TIMEOUT>>1;
+	static unsigned long last_gc;
+	struct rtable *rth, **rthp;
+	unsigned long now;
+
+	start_bh_atomic();
+	now = jiffies;
+
+	/*
+	 * Garbage collection is pretty expensive, so run it at most once
+	 * per second; a call that is skipped still halves the timeout.
+	 */
+	if (now - last_gc < 1*HZ) {
+		expire >>= 1;
+		end_bh_atomic();
+		return;
+	}
+
+	expire++;
+
+	for (i=0; i<RT_HASH_DIVISOR; i++) {
+		if (!rt_hash_table[i])
+			continue;
+		for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next)	{
+			if (rth->u.dst.refcnt || now - rth->u.dst.lastuse <= expire)
+				continue;	/* keep entries that are referenced or still fresh */
+			atomic_dec(&rt_cache_size);
+			*rthp = rth->u.rt_next;
+			rth->u.rt_next = NULL;
+			rt_free(rth);
+			break;	/* rth is gone; at most one entry is evicted per chain */
+		}
+	}
+
+	last_gc = now;
+	if (rt_cache_size < RT_CACHE_MAX_SIZE)
+		expire = RT_CACHE_TIMEOUT>>1;
+	else
+		expire >>= 1;
+	end_bh_atomic();
+}
+
+static int rt_ll_bind(struct rtable *rt)
+{
+	struct dst_entry *neigh;
+	struct hh_cache	*hh = NULL;
+
+	if (rt->u.dst.dev && rt->u.dst.dev->hard_header_cache) {
+		neigh = rt->u.dst.neighbour;
+		if (!neigh)
+			neigh = arp_find_neighbour(&rt->u.dst, 1);
+
+		if (neigh) {
+			rt->u.dst.neighbour = neigh;
+			for (hh=neigh->hh; hh; hh = hh->hh_next)
+				if (hh->hh_type == ETH_P_IP)
+					break;
+		}
+
+		if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
+#if RT_CACHE_DEBUG >= 2
+			extern atomic_t hh_count;
+			atomic_inc(&hh_count);
+#endif
+			memset(hh, 0, sizeof(struct hh_cache));
+			hh->hh_type = ETH_P_IP;
+			hh->hh_refcnt = 0;
+			hh->hh_next = NULL;
+			if (rt->u.dst.dev->hard_header_cache(&rt->u.dst, neigh, hh)) {
+				kfree(hh);
+#if RT_CACHE_DEBUG >= 2
+				atomic_dec(&hh_count);
+#endif
+				hh = NULL;
+			} else if (neigh) {
+				atomic_inc(&hh->hh_refcnt);
+				hh->hh_next = neigh->hh;
+				neigh->hh = hh;
+			}
+		}
+		if (hh)	{
+			atomic_inc(&hh->hh_refcnt);
+			rt->u.dst.hh = hh;
+			return hh->hh_uptodate;
+		}
+	}
+	return 0;
+}
+
+
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol)
+{
+	struct rtable	*rth, **rthp;
+	unsigned long	now = jiffies;
+
+	rt->u.dst.priority = rt_tos2priority(rt->key.tos);
+
+	start_bh_atomic();
+
+	rthp = &rt_hash_table[hash];
+
+	while ((rth = *rthp) != NULL) {
+		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
+			/* Put it first */
+			*rthp = rth->u.rt_next;
+			rth->u.rt_next = rt_hash_table[hash];
+			rt_hash_table[hash] = rth;
+
+			atomic_inc(&rth->u.dst.refcnt);
+			atomic_inc(&rth->u.dst.use);
+			rth->u.dst.lastuse = now;
+			end_bh_atomic();
+
+			ip_rt_put(rt);
+			rt_free(rt);
+			return rth;
+		}
+
+		rthp = &rth->u.rt_next;
+	}
+
+	if (rt_cache_size >= RT_CACHE_MAX_SIZE)
+		rt_garbage_collect();
+
+	rt->u.rt_next = rt_hash_table[hash];
+#if RT_CACHE_DEBUG >= 2
+	if (rt->u.rt_next) {
+		struct rtable * trt;
+		printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
+		for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
+			printk(" . %08x", trt->rt_dst);
+		printk("\n");
 	}
 #endif
+	rt_hash_table[hash] = rt;
+	atomic_inc(&rt_cache_size);
+
+	if (protocol == ETH_P_IP)
+		rt_ll_bind(rt);
+
+	end_bh_atomic();
+	return rt;
 }
 
-static void rt_garbage_collect_1(void)
+void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
+		    u32 saddr, u8 tos, struct device *dev)
 {
 	int i;
-	unsigned expire = RT_CACHE_TIMEOUT>>1;
-	struct rtable * rth, **rthp;
-	unsigned long now = jiffies;
-
-	for (;;)
-	{
-		for (i=0; i<RT_HASH_DIVISOR; i++)
-		{
-			if (!ip_rt_hash_table[i])
+	int  off_link = 0;
+	struct fib_info *fi;
+	struct rtable *rth, **rthp;
+	u32  skeys[2] = { saddr, 0, };
+	struct device *pdev = net_alias_main_dev(dev);
+
+	tos &= IPTOS_TOS_MASK;
+
+	if (new_gw == old_gw || !ipv4_config.accept_redirects
+	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+		goto reject_redirect;
+
+	if ((new_gw^dev->pa_addr)&dev->pa_mask)
+		off_link = 1;
+
+	if (!ipv4_config.rfc1620_redirects) {
+		if (off_link)
+			goto reject_redirect;
+		if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev))
+			goto reject_redirect;
+	}
+
+	fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL);
+	if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT))
+		goto reject_redirect;
+
+	for (i=0; i<2; i++) {
+		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+		rthp=&rt_hash_table[hash];
+
+		while ( (rth = *rthp) != NULL) {
+			struct rtable *rt;
+
+			if (rth->key.dst != daddr ||
+			    rth->key.src != skeys[i] ||
+			    rth->key.tos != tos ||
+			    rth->key.dst_dev != NULL ||
+			    rth->key.src_dev != NULL) {
+				rthp = &rth->u.rt_next;
 				continue;
-			for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next)
-			{
-				if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now)
-					continue;
-				rt_cache_size--;
-				cli();
-				*rthp=rth->rt_next;
-				rth->rt_next = NULL;
-				sti();
-				rt_free(rth);
+			}
+
+			if (rth->rt_dst != daddr ||
+			    rth->rt_src != saddr ||
+			    rth->rt_flags&RTF_REJECT ||
+			    rth->rt_gateway != old_gw ||
+			    rth->u.dst.dev != dev)
 				break;
+
+			rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+			if (rt == NULL)
+				return;
+
+			/*
+			 * Copy all the information.
+			 */
+			rt->u.dst.refcnt = 1;
+			rt->u.dst.dev = dev;
+			rt->u.dst.input = rth->u.dst.input;
+			rt->u.dst.output = rth->u.dst.output;
+			rt->u.dst.pmtu = dev->mtu;
+			rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+			rt->u.dst.window = 0;
+			rt->u.dst.use = 1;
+			rt->u.dst.lastuse = jiffies;
+
+			rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED;
+			rt->rt_flags &= ~RTF_GATEWAY;
+			if (new_gw != daddr)
+				rt->rt_flags |= RTF_GATEWAY;
+
+			rt->rt_src = rth->rt_src;
+			rt->rt_dst = rth->rt_dst;
+			rt->rt_src_dev = rth->rt_src_dev;
+			rt->rt_spec_dst = rth->rt_spec_dst;
+			rt->key = rth->key;
+
+			/* But gateway is different ... */
+			rt->rt_gateway = new_gw;
+
+			if (off_link) {
+				if (fi->fib_dev != dev &&
+				    net_alias_main_dev(fi->fib_dev) == pdev)
+					rt->u.dst.dev = fi->fib_dev;
 			}
+
+			if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) {
+				ip_rt_put(rt);
+				rt_free(rt);
+				break;
+			}
+
+			*rthp = rth->u.rt_next;
+			rt_free(rth);
+			rt = rt_intern_hash(hash, rt, ETH_P_IP);
+			ip_rt_put(rt);
+			break;
 		}
-		if (rt_cache_size < RT_CACHE_SIZE_MAX)
-			return;
-		expire >>= 1;
 	}
+	return;
+
+reject_redirect:
+	if (ipv4_config.log_martians)
+		printk(KERN_INFO "Redirect from %lX/%s to %lX ignored."
+		       "Path = %lX -> %lX, tos %02x\n",
+		       ntohl(old_gw), dev->name, ntohl(new_gw),
+		       ntohl(saddr), ntohl(daddr), tos);
 }
 
-static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr)
+
+void ip_rt_advice(struct rtable **rp, int advice)
 {
-	unsigned long flags;
-	struct rt_req * tail;
+	struct rtable *rt;
 
-	save_flags(flags);
-	cli();
-	tail = *q;
-	if (!tail)
-		rtr->rtr_next = rtr;
-	else
-	{
-		rtr->rtr_next = tail->rtr_next;
-		tail->rtr_next = rtr;
+	if (advice)
+		return;
+
+	start_bh_atomic();
+	if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) {
+#if RT_CACHE_DEBUG >= 1
+		printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos);
+#endif
+		*rp = NULL;
+		ip_rt_put(rt);
+		rt_cache_flush(0);
 	}
-	*q = rtr;
-	restore_flags(flags);
+	end_bh_atomic();
 	return;
 }
 
 /*
- * Caller should mask interrupts.
+ * Algorithm:
+ *	1. The first RT_REDIRECT_NUMBER redirects are sent
+ *	   with exponential backoff, then we stop sending them at all,
+ *	   assuming that the host ignores our redirects.
+ *	2. If we did not see any packets requiring redirects
+ *	   during RT_REDIRECT_SILENCE, we assume that the host
+ *	   forgot redirected route and start to send redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
  */
 
-static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q)
+void ip_rt_send_redirect(struct sk_buff *skb)
 {
-	struct rt_req * rtr;
+	struct rtable *rt = (struct rtable*)skb->dst;
 
-	if (*q)
-	{
-		rtr = (*q)->rtr_next;
-		(*q)->rtr_next = rtr->rtr_next;
-		if (rtr->rtr_next == rtr)
-			*q = NULL;
-		rtr->rtr_next = NULL;
-		return rtr;
+	/* No redirected packets during RT_REDIRECT_SILENCE;
+	 * reset the algorithm.
+	 */
+	if (jiffies - rt->last_error > RT_REDIRECT_SILENCE)
+		rt->errors = 0;
+
+	/* Too many ignored redirects; do not send anything
+	 * set last_error to the last seen redirected packet.
+	 */
+	if (rt->errors >= RT_REDIRECT_NUMBER) {
+		rt->last_error = jiffies;
+		return;
+	}
+
+	/* Check for load limit; set last_error to the latest sent
+	 * redirect and count it whether or not martians are logged.
+	 */
+	if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		rt->last_error = jiffies;
+		if (++rt->errors == RT_REDIRECT_NUMBER && ipv4_config.log_martians)
+			printk(KERN_WARNING "host %08x/%s ignores redirects.\n", rt->rt_src, rt->rt_src_dev->name);
 	}
-	return NULL;
 }
 
-/*
-   Called with masked interrupts
- */
+static int ip_error(struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable*)skb->dst;
+	int code;
+
+	switch (rt->u.dst.error) {
+	case EINVAL:
+	default:
+		kfree_skb(skb, FREE_READ);
+		return 0;
+	case ENETUNREACH:
+		code = ICMP_NET_UNREACH;
+		break;
+	case EACCES:
+		code = ICMP_PKT_FILTERED;
+		break;
+	}
+	if (jiffies - rt->last_error > RT_ERROR_LOAD) {
+		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+		rt->last_error = jiffies;
+	}
+	kfree_skb(skb, FREE_READ);
+	return 0;
+} 
+
 
-static void rt_kick_backlog()
+static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 {
-	if (!ip_rt_lock)
-	{
-		struct rt_req * rtr;
+	if (old_mtu > 32000)
+		return 32000;
+	else if (old_mtu > 17914)
+		return 17914;
+	else if (old_mtu > 8166)
+		return 8166;
+	else if (old_mtu > 4352)
+		return 4352;
+	else if (old_mtu > 2002)
+		return 2002;
+	else if (old_mtu > 1492)
+		return 1492;
+	else if (old_mtu > 576)
+		return 576;
+	else if (old_mtu > 296)
+		return 296;
+	/*
+	 *	These two are not from the RFC but
+	 *	are needed for AMPRnet AX.25 paths.
+	 */
+	else if (old_mtu > 216)
+		return 216;
+	else if (old_mtu > 128)
+		return 128;
+	return 68;
+}
 
-		ip_rt_fast_lock();
 
-		while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL)
-		{
-			sti();
-			rt_redirect_1(rtr->dst, rtr->gw, rtr->dev);
-			kfree_s(rtr, sizeof(struct rt_req));
-			cli();
+unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+{
+	int i;
+	unsigned short old_mtu = ntohs(iph->tot_len);
+	struct rtable *rth;
+	u32  skeys[2] = { iph->saddr, 0, };
+	u32  daddr = iph->daddr;
+	u8   tos = iph->tos & IPTOS_TOS_MASK;
+	unsigned short est_mtu = 0;
+
+	if (ipv4_config.no_pmtu_disc)
+		return 0;
+
+	for (i=0; i<2; i++) {
+		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+		for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
+			if (rth->key.dst == daddr &&
+			    rth->key.src == skeys[i] &&
+			    rth->rt_dst == daddr &&
+			    rth->rt_src == iph->saddr &&
+			    rth->key.tos == tos &&
+			    !rth->key.src_dev &&
+			    !(rth->rt_flags&RTF_NOPMTUDISC)) {
+				unsigned short mtu = new_mtu;
+
+				if (new_mtu < 68 || new_mtu >= old_mtu) {
+
+					/* BSD 4.2 compatibility hack :-( */
+					if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
+					    old_mtu >= 68 + (iph->ihl<<2))
+						old_mtu -= iph->ihl<<2;
+
+					mtu = guess_mtu(old_mtu);
+				}
+				if (mtu < rth->u.dst.pmtu) {
+					rth->u.dst.pmtu = mtu;
+					est_mtu = mtu;
+				}
+			}
 		}
+	}
+	return est_mtu;
+}
 
-		ip_rt_bh_mask &= ~RT_BH_REDIRECT;
 
-		ip_rt_fast_unlock();
+static void ipv4_dst_destroy(struct dst_entry * dst)
+{
+	struct rtable * rt = (struct rtable*)dst;
+	struct hh_cache * hh = rt->u.dst.hh;
+	rt->u.dst.hh = NULL;
+	if (hh && atomic_dec_and_test(&hh->hh_refcnt)) {
+#if RT_CACHE_DEBUG >= 2
+		extern atomic_t hh_count;
+		atomic_dec(&hh_count);
+#endif
+		kfree(hh);
 	}
 }
 
-/*
- * rt_{del|add|flush} called only from USER process. Waiting is OK.
- */
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst)
+{
+	return NULL;
+}
 
-static int rt_del(__u32 dst, __u32 mask,
-		struct device * dev, __u32 gtw, short rt_flags, short metric)
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst)
 {
-	int retval;
+	return NULL;
+}
+
+int
+ip_check_mc(struct device *dev, u32 mc_addr)
+{
+	struct ip_mc_list *ip_mc;
+
+	if (mc_addr==htonl(INADDR_ALLHOSTS_GROUP))
+		return 1;
+
+	for (ip_mc=dev->ip_mc_list; ip_mc; ip_mc=ip_mc->next)
+		if (ip_mc->multiaddr == mc_addr)
+			return 1;
+	return 0;
+}
 
-	while (ip_rt_lock)
-		sleep_on(&rt_wait);
-	ip_rt_fast_lock();
-	retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric);
-	ip_rt_unlock();
-	wake_up(&rt_wait);
-	return retval;
-}
-
-static void rt_add(short flags, __u32 dst, __u32 mask,
-	__u32 gw, struct device *dev, unsigned short mss,
-	unsigned long window, unsigned short irtt, short metric)
-{
-	while (ip_rt_lock)
-		sleep_on(&rt_wait);
-	ip_rt_fast_lock();
-	fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric);
-	ip_rt_unlock();
-	wake_up(&rt_wait);
-}
-
-void ip_rt_flush(struct device *dev)
-{
-	while (ip_rt_lock)
-		sleep_on(&rt_wait);
-	ip_rt_fast_lock();
-	fib_flush_1(dev);
-	ip_rt_unlock();
-	wake_up(&rt_wait);
+static int ip_rt_bug(struct sk_buff *skb)
+{
+	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
+	       skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb, FREE_WRITE);	/* free only after the log line has read skb */
+	return 0;
 }
 
 /*
-   Called by ICMP module.
+ *	This function is called ONLY FROM NET BH. No locking!
+ *
+ *	NOTE. We drop all the packets that have local source
+ *	addresses, because every properly looped back packet
+ *	must have correct destination already attached by output routine.
+ *
+ *	Such approach solves two big problems:
+ *	1. Not simplex devices (if they exist 8)) are handled properly.
+ *	2. IP spoofing attempts are filtered with 100% of guarantee.
  */
 
-void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev)
-{
-	struct rt_req * rtr;
-	struct rtable * rt;
+int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+			u8 tos, struct device *pdev)
+{
+	struct device * dev = pdev;
+	struct fib_info *fi = NULL;
+	struct fib_info *src_fi = NULL;
+	unsigned	flags = 0;
+	struct	device	*devout;
+	struct rtable * rth;
+	unsigned	hash;
+	struct fib_result res;
+	u32	src_key = saddr;
+	u32	dst_key = daddr;
+	int	err = -EINVAL;
+	int	log = 0;
 
-	rt = ip_rt_route(dst, 0);
-	if (!rt)
-		return;
+	hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos);
 
-	if (rt->rt_gateway != src ||
-	    rt->rt_dev != dev ||
-	    ((gw^dev->pa_addr)&dev->pa_mask) ||
-	    ip_chk_addr(gw))
-	{
-		ip_rt_put(rt);
-		return;
-	}
-	ip_rt_put(rt);
+	/*	Check for martians... */
 
-	ip_rt_fast_lock();
-	if (ip_rt_lock == 1)
-	{
-		rt_redirect_1(dst, gw, dev);
-		ip_rt_unlock();
-		return;
-	}
+	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+		goto martian_source;
+	if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
+		goto mc_input;
 
-	rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC);
-	if (rtr)
-	{
-		rtr->dst = dst;
-		rtr->gw = gw;
-		rtr->dev = dev;
-		rt_req_enqueue(&rt_backlog, rtr);
-		ip_rt_bh_mask |= RT_BH_REDIRECT;
-	}
-	ip_rt_unlock();
-}
+	/* Accept zero addresses only to limited broadcast/multicasts;
+	 * I even do not know to fix it or not.
+	 */
+	if (ZERONET(saddr))
+		goto martian_source;
+	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+		goto martian_destination;
 
+	/*
+	 * Device is not yet initialized, accept all addresses as ours.
+	 */
+	if (ZERONET(dev->pa_addr))
+		goto promisc_ip;
 
-static __inline__ void rt_garbage_collect(void)
-{
-	if (ip_rt_lock == 1)
-	{
-		rt_garbage_collect_1();
-		return;
+	/*
+	 *	Now we are able to route packet.
+	 */
+	if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) {
+		if (!IS_ROUTER)
+			return -EINVAL;
+		goto no_route;
 	}
-	ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;
-}
 
-static void rt_cache_add(unsigned hash, struct rtable * rth)
-{
-	unsigned long	flags;
-	struct rtable	**rthp;
-	__u32		daddr = rth->rt_dst;
-	unsigned long	now = jiffies;
+	fi = res.f->fib_info;
+	flags  = fi->fib_flags;
+	devout = fi->fib_dev;
 
-#if RT_CACHE_DEBUG >= 2
-	if (ip_rt_lock != 1)
-	{
-		printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock);
-		return;
+	if (flags&RTF_NAT) {
+		daddr = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway;
+		fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL);
+		if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST))
+			return -EINVAL;
+		devout = fi->fib_dev;
+		flags = fi->fib_flags|RTCF_NAT|RTF_NAT;
 	}
-#endif
 
-	save_flags(flags);
+	switch (res.fr->cl_action) {
+	case RTP_NAT:
+		/* Packet is from  translated source; remember it */
+		saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap;
+		flags |= RTCF_NAT;
+		break;
+	case RTP_MASQUERADE:
+		/* Packet is from masqueraded source; remember it */
+		flags |= RTCF_MASQ;
+		break;
+	default:
+	}
+	log = res.fr->cl_flags&RTRF_LOG;
 
-	if (rth->rt_dev->header_cache_bind)
-	{
-		struct rtable * rtg = rth;
-
-		if (rth->rt_gateway != daddr)
-		{
-			ip_rt_fast_unlock();
-			rtg = ip_rt_route(rth->rt_gateway, 0);
-			ip_rt_fast_lock();
-		}
+	if (!(flags & RTF_LOCAL)) {
+		if (!IS_ROUTER || flags&RTF_NOFORWARD)
+			return -EINVAL;
+	} else {
+		fi = NULL;
+		devout = &loopback_dev;
+		if (flags&RTF_BROADCAST)
+		    goto mc_input;
+	}
 
-		if (rtg)
-		{
-			if (rtg == rth)
-				rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst);
-			else
-			{
-				if (rtg->rt_hh)
-					atomic_inc(&rtg->rt_hh->hh_refcnt);
-				rth->rt_hh = rtg->rt_hh;
-				ip_rt_put(rtg);
-			}
-		}
+#ifndef CONFIG_IP_LOCAL_RT_POLICY
+	if (flags&RTF_LOCAL)
+		src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL);
+	else
+#endif
+	if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) {
+		src_fi = res.f->fib_info;
+		/* Destination is on masqueraded network:
+		 * if it is real incoming frame, ip_forward will drop it.
+		 */
+		if (res.fr->cl_flags&RTRF_VALVE)
+			flags |= RTCF_VALVE;
 	}
 
-	if (rt_cache_size >= RT_CACHE_SIZE_MAX)
-		rt_garbage_collect();
+        if (src_fi) {
+		if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
+			goto martian_source;
+
+		if (!(src_fi->fib_flags&RTF_GATEWAY))
+			flags |= RTCF_DIRECTSRC;
+
+		if (net_alias_main_dev(src_fi->fib_dev) == pdev)
+			skb->dev = dev = src_fi->fib_dev;
+		else {
+			/* Route to packet source goes via
+			   different interface; rfc1812 proposes
+			   to drop them.
+			   It is dangerous on not-stub/transit networks
+			   because of path asymmetry.
+			 */
+			if (ipv4_config.rfc1812_filter >= 2)
+				goto martian_source;
 
-	cli();
-	rth->rt_next = ip_rt_hash_table[hash];
-#if RT_CACHE_DEBUG >= 2
-	if (rth->rt_next)
-	{
-		struct rtable * trth;
-		printk("rt_cache @%02x: %08x", hash, daddr);
-		for (trth=rth->rt_next; trth; trth=trth->rt_next)
-			printk(" . %08x", trth->rt_dst);
-		printk("\n");
+			/* Weaker form of rfc1812 filtering.
+			   If source is on directly connected network,
+			   it can mean either local network configuration error
+			   (the most probable case) or real IP spoofing attempt.
+			 */
+			if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC))
+				goto martian_source;
+		}
+	} else if (ipv4_config.rfc1812_filter >= 1)
+		goto martian_source;
+
+make_route:
+	if (skb->protocol != __constant_htons(ETH_P_IP)) {
+		/* ARP request. Do not make route for invalid destination or
+		 * if it is redirected.
+		 */
+		if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) ||
+		    skb->pkt_type == PACKET_OTHERHOST ||
+		    (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT))))
+			return -EINVAL;
 	}
-#endif
-	ip_rt_hash_table[hash] = rth;
-	rthp = &rth->rt_next;
-	sti();
-	rt_cache_size++;
 
-	/*
-	 * Cleanup duplicate (and aged off) entries.
-	 */
+	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+	if (!rth)
+		return -ENOBUFS;
 
-	while ((rth = *rthp) != NULL)
-	{
+	rth->u.dst.output= ip_rt_bug;
 
-		cli();
-		if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
-		    || rth->rt_dst == daddr)
-		{
-			*rthp = rth->rt_next;
-			rt_cache_size--;
-			sti();
-#if RT_CACHE_DEBUG >= 2
-			printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst);
+	rth->u.dst.use	= 1;
+	rth->key.dst	= dst_key;
+	rth->rt_dst	= dst_key;
+	rth->rt_dst_map	= daddr;
+	rth->key.tos	= tos;
+	rth->key.src	= src_key;
+	rth->rt_src	= src_key;
+	rth->rt_src_map	= saddr;
+	rth->rt_src_dev = dev;
+	rth->key.src_dev= pdev;
+	rth->u.dst.dev	= devout;
+	rth->key.dst_dev= NULL;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= daddr;
+
+	if (!(flags&RTF_REJECT)) {
+		if (flags&RTF_LOCAL)
+			rth->u.dst.input= ip_local_deliver;
+		if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) {
+			if (flags&RTF_MULTICAST) {
+#ifdef CONFIG_IP_MROUTE
+				if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) {
+					rth->u.dst.input = ip_mr_input;
+					rth->u.dst.output = ip_output;
+				}
 #endif
-			rt_free(rth);
-			continue;
+			} else if (!(flags&RTF_LOCAL)) {
+				rth->u.dst.input = ip_forward;
+				rth->u.dst.output = ip_output;
+			}
 		}
-		sti();
-		rthp = &rth->rt_next;
-	}
-	restore_flags(flags);
-}
-
-/*
-   RT should be already locked.
-   
-   We could improve this by keeping a chain of say 32 struct rtable's
-   last freed for fast recycling.
-   
- */
+	} else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) {
+		rth->u.dst.input= ip_error;
+		rth->u.dst.error= -err;
+	}
+
+	if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL))
+		rth->rt_spec_dst= dev->pa_addr;
+
+	if (fi) {
+		rth->u.dst.pmtu	= fi->fib_mtu;
+		rth->u.dst.window=fi->fib_window;
+		rth->u.dst.rtt	= fi->fib_irtt;
+		if (flags & RTF_GATEWAY)
+			rth->rt_gateway	= fi->fib_gateway;
+	} else {
+		rth->u.dst.pmtu	= devout->mtu;
+		rth->u.dst.window=0;
+		rth->u.dst.rtt	= TCP_TIMEOUT_INIT;
+	}
+
+	if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) &&
+	    flags&RTCF_DIRECTSRC &&
+	    (devout == dev || (ipv4_config.rfc1620_redirects &&
+			       net_alias_main_dev(devout) == pdev)))
+		flags |= RTCF_DOREDIRECT;
 
-struct rtable * ip_rt_slow_route (__u32 daddr, int local)
-{
-	unsigned hash = ip_rt_hash_code(daddr)^local;
-	struct rtable * rth;
-	struct fib_node * f;
-	struct fib_info * fi;
-	__u32 saddr;
+	rth->rt_flags = flags;
 
-#if RT_CACHE_DEBUG >= 2
-	printk("rt_cache miss @%08x\n", daddr);
-#endif
+	if (log)
+		printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst));
 
-	rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
-	if (!rth)
-	{
-		ip_rt_unlock();
-		return NULL;
+	if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) {
+		skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0);
+		return 0;
 	}
+	skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol));
+	return 0;
 
-	if (local)
-		f = fib_lookup_local(daddr);
-	else
-		f = fib_lookup (daddr);
+mc_input:
+	if (skb->protocol != __constant_htons(ETH_P_IP))
+		return -EINVAL;
 
-	if (f)
-	{
-		fi = f->fib_info;
-		f->fib_use++;
+	if (ZERONET(saddr)) {
+		if (!ipv4_config.bootp_agent)
+			goto martian_source;
+		flags |= RTF_NOFORWARD|RTF_LOCAL;
+	} else {
+		src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL);
+		if (!src_fi)
+			goto martian_source;
+
+		if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
+			goto martian_source;
+
+		if (!(src_fi->fib_flags&RTF_GATEWAY))
+			flags |= RTCF_DIRECTSRC;
+
+		if (!MULTICAST(daddr) || !ipv4_config.multicast_route ||
+		    LOCAL_MCAST(daddr)) {
+			if (net_alias_main_dev(src_fi->fib_dev) == pdev) {
+				skb->dev = dev = src_fi->fib_dev;
+			} else {
+				/* Fascist not-unicast filtering 8) */
+				goto martian_source;
+			}
+		}
 	}
 
-	if (!f || (fi->fib_flags & RTF_REJECT))
-	{
-#ifdef CONFIG_KERNELD	
-		char wanted_route[20];
-#endif		
-#if RT_CACHE_DEBUG >= 2
-		printk("rt_route failed @%08x\n", daddr);
-#endif
-		ip_rt_unlock();
-		kfree_s(rth, sizeof(struct rtable));
-#ifdef CONFIG_KERNELD		
-		daddr=ntohl(daddr);
-		sprintf(wanted_route, "%d.%d.%d.%d",
-			(int)(daddr >> 24) & 0xff, (int)(daddr >> 16) & 0xff,
-			(int)(daddr >> 8) & 0xff, (int)daddr & 0xff);
-		kerneld_route(wanted_route); 	/* Dynamic route request */
-#endif		
-		return NULL;
-	}
-
-	saddr = fi->fib_dev->pa_addr;
-
-	if (daddr == fi->fib_dev->pa_addr)
-	{
-		f->fib_use--;
-		if ((f = fib_loopback) != NULL)
-		{
-			f->fib_use++;
-			fi = f->fib_info;
-		}
+	if (!MULTICAST(daddr)) {
+		flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD;
+		devout = dev;
+		goto make_route;
 	}
-	
-	if (!f)
-	{
-		ip_rt_unlock();
-		kfree_s(rth, sizeof(struct rtable));
-		return NULL;
+
+	flags |= RTF_MULTICAST|RTF_LOCAL;
+
+	if (ip_check_mc(dev, daddr) == 0) {
+		flags &= ~RTF_LOCAL;
+
+		if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI))
+			goto no_route;
 	}
+	devout = dev;
+	goto make_route;
 
-	rth->rt_dst	= daddr;
-	rth->rt_src	= saddr;
-	rth->rt_lastuse	= jiffies;
-	rth->rt_refcnt	= 1;
-	rth->rt_use	= 1;
-	rth->rt_next	= NULL;
-	rth->rt_hh	= NULL;
-	rth->rt_gateway	= fi->fib_gateway;
-	rth->rt_dev	= fi->fib_dev;
-	rth->rt_mtu	= fi->fib_mtu;
-	rth->rt_window	= fi->fib_window;
-	rth->rt_irtt	= fi->fib_irtt;
-	rth->rt_tos	= f->fib_tos;
-	rth->rt_flags   = fi->fib_flags | RTF_HOST;
-	if (local)
-		rth->rt_flags   |= RTF_LOCAL;
+promisc_ip:
+	flags |= RTF_LOCAL|RTF_NOFORWARD;
+	if (MULTICAST(daddr))
+		flags |= RTF_MULTICAST;
+	else
+		flags |= RTF_BROADCAST;
+	devout = dev;
+	goto make_route;
+
+no_route:
+	flags |= RTF_REJECT;
+	devout = dev;
+	goto make_route;
 
-	if (!(rth->rt_flags & RTF_GATEWAY))
-		rth->rt_gateway = rth->rt_dst;
 	/*
-	 *	Multicast or limited broadcast is never gatewayed.
+	 *	Do not cache martian addresses: they should be logged (RFC1812)
 	 */
-	if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
-		rth->rt_gateway = rth->rt_dst;
+martian_destination:
+	if (ipv4_config.log_martians)
+		printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
+	return -EINVAL;
 
-	if (ip_rt_lock == 1)
-		rt_cache_add(hash, rth);
-	else
-	{
-		rt_free(rth);
-#if RT_CACHE_DEBUG >= 1
-		printk(KERN_DEBUG "rt_cache: route to %08x was born dead\n", daddr);
-#endif
+martian_source:
+	if (ipv4_config.log_martians) {
+		/*
+		 *	RFC1812 recommendation, if source is martian,
+		 *	the only hint is MAC header.
+		 */
+		printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
+		if (dev->hard_header_len) {
+			int i;
+			unsigned char *p = skb->mac.raw;
+			printk(KERN_WARNING "ll header:");
+			for (i=0; i<dev->hard_header_len; i++, p++)
+				printk(" %02x", *p);
+			printk("\n");
+		}
 	}
-
-	ip_rt_unlock();
-	return rth;
-}
-
-void ip_rt_put(struct rtable * rt)
-{
-	if (rt)
-		atomic_dec(&rt->rt_refcnt);
+	return -EINVAL;
 }
 
-struct rtable * ip_rt_route(__u32 daddr, int local)
+int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
+		   u8 tos, struct device *dev)
 {
 	struct rtable * rth;
+	unsigned	hash;
 
-	ip_rt_fast_lock();
+	if (skb->dst)
+		return 0;
 
-	for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next)
-	{
-		if (rth->rt_dst == daddr)
-		{
-			rth->rt_lastuse = jiffies;
-			atomic_inc(&rth->rt_use);
-			atomic_inc(&rth->rt_refcnt);
-			ip_rt_unlock();
-			return rth;
+#if RT_CACHE_DEBUG >= 1
+	if (dev->flags & IFF_LOOPBACK) {
+		printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n");
+		return -EINVAL;
+	}
+	if (net_alias_main_dev(dev) != dev)
+		printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name);
+#endif
+
+	tos &= IPTOS_TOS_MASK;
+	hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos);
+	skb->dev = dev;
+
+	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+		if (rth->key.dst == daddr &&
+		    rth->key.src == saddr &&
+		    rth->key.src_dev == dev &&
+		    rth->key.dst_dev == NULL &&
+		    rth->key.tos == tos) {
+			rth->u.dst.lastuse = jiffies;
+			atomic_inc(&rth->u.dst.use);
+			atomic_inc(&rth->u.dst.refcnt);
+			skb->dst = (struct dst_entry*)rth;
+			skb->dev = rth->rt_src_dev;
+			return 0;
 		}
 	}
-	return ip_rt_slow_route (daddr, local);
+	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 }
 
+
 /*
- *	Process a route add request from the user, or from a kernel
- *	task.
+ * Major route resolver routine.
  */
- 
-int ip_rt_new(struct rtentry *r)
-{
-	int err;
-	char * devname;
-	struct device * dev = NULL;
-	unsigned long flags;
-	__u32 daddr, mask, gw;
-	short metric;
 
-	/*
-	 *	If a device is specified find it.
-	 */
-	 
-	if ((devname = r->rt_dev) != NULL) 
-	{
-		err = getname(devname, &devname);
-		if (err)
-			return err;
-		dev = dev_get(devname);
-		putname(devname);
-		if (!dev)
-			return -ENODEV;
-	}
-	
-	/*
-	 *	If the device isn't INET, don't allow it
-	 */
+int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos,
+			 struct device *dev_out)
+{
+	u32 src_key = saddr;
+	u32 dst_key = daddr;
+	u32 dst_map;
+	struct device *dst_dev_key = dev_out;
+	unsigned flags = 0;
+	struct fib_info *fi = NULL;
+	struct rtable *rth;
+#ifdef CONFIG_IP_LOCAL_RT_POLICY
+	struct fib_result res;
+#endif
+	unsigned hash;
 
-	if (r->rt_dst.sa_family != AF_INET)
-		return -EAFNOSUPPORT;
+	tos &= IPTOS_TOS_MASK|1;
 
-	/*
-	 *	Make local copies of the important bits
-	 *	We decrement the metric by one for BSD compatibility.
-	 */
-	 
-	flags = r->rt_flags;
-	daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr;
-	mask  = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr;
-	gw    = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr;
-	metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0;
+	if (saddr) {
+		if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) ||
+		    __ip_chk_addr(saddr) != IS_MYADDR)
+			return -EINVAL;
+		if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF))
+			dev_out = ip_dev_find(saddr, NULL);
+	}
+	if (!daddr)
+		daddr = saddr;
 
-	/*
-	 *	BSD emulation: Permits route add someroute gw one-of-my-addresses
-	 *	to indicate which iface. Not as clean as the nice Linux dev technique
-	 *	but people keep using it...  (and gated likes it ;))
-	 */
-	 
-	if (!dev && (flags & RTF_GATEWAY)) 
-	{
-		struct device *dev2;
-		for (dev2 = dev_base ; dev2 != NULL ; dev2 = dev2->next) 
-		{
-			if ((dev2->flags & IFF_UP) && dev2->pa_addr == gw) 
-			{
-				flags &= ~RTF_GATEWAY;
-				dev = dev2;
-				break;
-			}
-		}
+	if (dev_out) {
+		if (!saddr) {
+			saddr = dev_out->pa_addr;
+			if (!daddr)
+				daddr = saddr;
+		}
+		dst_map = daddr;
+		if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
+			goto make_route;
+	}
+
+	if (!daddr)
+		daddr = htonl(INADDR_LOOPBACK);
+
+#ifdef CONFIG_IP_LOCAL_RT_POLICY
+	if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out))
+		return -ENETUNREACH;
+	fi = res.f->fib_info;
+	dst_map = daddr;
+
+	if (fi->fib_flags&RTF_NAT) {
+		dst_map = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway;
+		fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out);
+		if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST))
+			return -EINVAL;
+		flags = RTCF_NAT;
 	}
 
-	if (flags & RTF_HOST) 
-		mask = 0xffffffff;
-	else if (mask && r->rt_genmask.sa_family != AF_INET)
-		return -EAFNOSUPPORT;
-
-	if (flags & RTF_GATEWAY) 
-	{
-		if (r->rt_gateway.sa_family != AF_INET)
-			return -EAFNOSUPPORT;
+	if (!saddr) {
+		saddr = fi->fib_dev->pa_addr;
 
 		/*
-		 *	Don't try to add a gateway we can't reach.. 
-		 *	Tunnel devices are exempt from this rule.
+		 * "Stabilization" of route.
+		 * This step is necessary when locally originated packets
+		 * are subject to source routing; otherwise we could get
+		 * route flapping.
 		 */
-
-		if (!dev)
-			dev = get_gw_dev(gw);
-		else if (dev != get_gw_dev(gw) && dev->type != ARPHRD_TUNNEL)
-			return -EINVAL;
-		if (!dev)
-			return -ENETUNREACH;
-	} 
-	else
-	{
-		gw = 0;
-		if (!dev)
-			dev = ip_dev_bynet(daddr, mask);
-		if (!dev)
+		fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out);
+		if (!fi)
 			return -ENETUNREACH;
-		if (!mask)
-		{
-			if (((daddr ^ dev->pa_addr) & dev->pa_mask) == 0)
-				mask = dev->pa_mask;
-		}
 	}
+#else
+	fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out);
+	if (!fi)
+		return -ENETUNREACH;
+
+	if (fi->fib_flags&RTF_NAT)
+		return -EINVAL;
 
-#ifndef CONFIG_IP_CLASSLESS
-	if (!mask)
-		mask = ip_get_mask(daddr);
+	dst_map = daddr;
+	if (!saddr)
+		saddr = fi->fib_dev->pa_addr;
 #endif
-	
-	if (bad_mask(mask, daddr))
+
+	flags |= fi->fib_flags;
+	dev_out = fi->fib_dev;
+
+	if (RT_LOCALADDR(flags)) {
+		dev_out = &loopback_dev;
+		fi = NULL;
+	}
+
+	if (dst_dev_key && dev_out != dst_dev_key)
 		return -EINVAL;
 
-	/*
-	 *	Add the route
-	 */
+make_route:
+	if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) {
+		printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr);
+		return -EINVAL;
+	}
+
+	if (daddr == 0xFFFFFFFF)
+		flags |= RTF_BROADCAST;
+	else if (MULTICAST(daddr))
+		flags |= RTF_MULTICAST;
+	else if (BADCLASS(daddr) || ZERONET(daddr))
+		return -EINVAL;
+
+	if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK ||
+	    !(dev_out->flags&IFF_BROADCAST)))
+		flags &= ~RTF_LOCAL;
+	else if (flags&RTF_MULTICAST) {
+		if (ip_check_mc(dev_out, daddr))
+			flags |= RTF_LOCAL;
+	}
+	
+	rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+	if (!rth)
+		return -ENOBUFS;
+
+	rth->u.dst.use	= 1;
+	rth->key.dst	= dst_key;
+	rth->key.tos	= tos;
+	rth->key.src	= src_key;
+	rth->key.src_dev= NULL;
+	rth->key.dst_dev= dst_dev_key;
+	rth->rt_dst	= daddr;
+	rth->rt_dst_map	= dst_map;
+	rth->rt_src	= saddr;
+	rth->rt_src_map	= saddr;
+	rth->rt_src_dev = dev_out;
+	rth->u.dst.dev	= dev_out;
+	rth->rt_gateway = dst_map;
+	rth->rt_spec_dst= dev_out->pa_addr;
+
+	rth->u.dst.output=ip_output;
+
+	if (flags&RTF_LOCAL) {
+		rth->u.dst.input = ip_local_deliver;
+		rth->rt_spec_dst = daddr;
+	}
+	if (flags&(RTF_BROADCAST|RTF_MULTICAST)) {
+		rth->rt_spec_dst = dev_out->pa_addr;
+		flags &= ~RTF_GATEWAY;
+		if (flags&RTF_LOCAL)
+			rth->u.dst.output = ip_mc_output;
+		if (flags&RTF_MULTICAST) {
+			if (dev_out->flags&IFF_ALLMULTI)
+				rth->u.dst.output = ip_mc_output;
+#ifdef CONFIG_IP_MROUTE
+			if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr))
+				rth->u.dst.input = ip_mr_input;
+#endif
+		}
+	}
 
-	rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
+	if (fi) {
+		if (flags&RTF_GATEWAY)
+			rth->rt_gateway = fi->fib_gateway;
+		rth->u.dst.pmtu	= fi->fib_mtu;
+		rth->u.dst.window=fi->fib_window;
+		rth->u.dst.rtt	= fi->fib_irtt;
+	} else {
+		rth->u.dst.pmtu	= dev_out->mtu;
+		rth->u.dst.window=0;
+		rth->u.dst.rtt	= TCP_TIMEOUT_INIT;
+	}
+	rth->rt_flags = flags;
+	hash = rt_hash_code(dst_key, src_key, tos);
+	if (dst_dev_key)
+		hash ^= dev_hash_name(dst_dev_key->name);
+	*rp = rt_intern_hash(hash, rth, ETH_P_IP);
 	return 0;
 }
 
+int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out)	/* Output route lookup: scan rt_hash_table for a matching entry; on a hit store it in *rp and return 0, otherwise return whatever ip_route_output_slow() returns. */
+{
+	unsigned hash;	/* index of the rt_hash_table chain to scan */
+	struct rtable *rth;	/* cursor over that chain */
 
-/*
- *	Remove a route, as requested by the user.
- */
+	hash = rt_hash_code(daddr, saddr, tos);
+	if (dev_out)
+		hash ^= dev_out->hash;	/* NOTE(review): the code preceding this function builds its hash with dev_hash_name(dst_dev_key->name); presumably that equals dev->hash -- confirm */
 
-int ip_rt_kill(struct rtentry *r)
-{
-	struct sockaddr_in *trg;
-	struct sockaddr_in *msk;
-	struct sockaddr_in *gtw;
-	char *devname;
-	int err;
-	struct device * dev = NULL;
-
-	trg = (struct sockaddr_in *) &r->rt_dst;
-	msk = (struct sockaddr_in *) &r->rt_genmask;
-	gtw = (struct sockaddr_in *) &r->rt_gateway;
-	if ((devname = r->rt_dev) != NULL) 
-	{
-		err = getname(devname, &devname);
-		if (err)
-			return err;
-		dev = dev_get(devname);
-		putname(devname);
-		if (!dev)
-			return -ENODEV;
+	start_bh_atomic();	/* every exit path below pairs this with end_bh_atomic() */
+	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+		if (rth->key.dst == daddr &&
+		    rth->key.src == saddr &&
+		    rth->key.src_dev == NULL &&	/* only entries keyed without a source device match */
+		    rth->key.dst_dev == dev_out &&	/* device compared by pointer; NULL matches NULL */
+		    rth->key.tos == tos) {
+			rth->u.dst.lastuse = jiffies;	/* stamp the time of this hit */
+			atomic_inc(&rth->u.dst.use);
+			atomic_inc(&rth->u.dst.refcnt);	/* reference for the entry returned through *rp */
+			end_bh_atomic();
+			*rp = rth;
+			return 0;
+		}
 	}
-	/*
-	 * metric can become negative here if it wasn't filled in
-	 * but that's a fortunate accident; we really use that in rt_del.
-	 */
-	err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev,
-		(__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
-	return err;
+	end_bh_atomic();
+
+	return ip_route_output_slow(rp, daddr, saddr, tos, dev_out);	/* no matching entry in the chain: defer to the slow path with the same arguments */
 }
 
-/*
- *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
- */
- 
-int ip_rt_ioctl(unsigned int cmd, void *arg)
+int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, char *devname)	/* Like ip_route_output(), but the output device is given by name: returns 0 with *rp set on a hit in rt_hash_table, -ENODEV if dev_get() cannot find devname, otherwise the result of ip_route_output_slow(). */
 {
-	int err;
-	struct rtentry rt;
+	unsigned hash;	/* index of the rt_hash_table chain to scan */
+	struct rtable *rth;	/* cursor over that chain */
+	struct device *dev_out;	/* resolved from devname only after the lookup misses */
+
+	hash = rt_hash_code(daddr, saddr, tos)^dev_hash_mc_name(devname);	/* NOTE(review): the code above ip_route_output() hashes with dev_hash_name(); presumably both helpers agree for the same name -- confirm */
 
-	switch(cmd) 
-	{
-		case SIOCADDRT:		/* Add a route */
-		case SIOCDELRT:		/* Delete a route */
-			if (!suser())
-				return -EPERM;
-			err = copy_from_user(&rt, arg, sizeof(struct rtentry));
-			if (err)
-				return -EFAULT; 
-			return (cmd == SIOCDELRT) ? ip_rt_kill(&rt) : ip_rt_new(&rt);
+	start_bh_atomic();	/* every exit path below pairs this with end_bh_atomic() */
+	for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+		if (rth->key.dst == daddr &&
+		    rth->key.src == saddr &&
+		    rth->key.src_dev == NULL &&
+		    rth->key.tos == tos &&
+		    rth->key.dst_dev &&	/* entries keyed without an output device never match here */
+		    strcmp(rth->key.dst_dev->name, devname)==0) {	/* device matched by name, not by pointer */
+			rth->u.dst.lastuse = jiffies;	/* stamp the time of this hit */
+			atomic_inc(&rth->u.dst.use);
+			atomic_inc(&rth->u.dst.refcnt);	/* reference for the entry returned through *rp */
+			end_bh_atomic();
+			*rp = rth;
+			return 0;
+		}
 	}
+	end_bh_atomic();
 
-	return -EINVAL;
+	dev_out = dev_get(devname);	/* lookup missed: turn the name into a device for the slow path */
+	if (!dev_out)
+		return -ENODEV;
+	return ip_route_output_slow(rp, daddr, saddr, tos, dev_out);
 }
 
-void ip_rt_advice(struct rtable **rp, int advice)
+void ip_rt_multicast_event(struct device *dev)	/* Multicast event hook: flushes the route cache; the dev argument is not consulted. */
 {
-	/* Thanks! */
-	return;
+	rt_cache_flush(0);	/* NOTE(review): the 0 argument is presumably a delay meaning "flush now" -- confirm against rt_cache_flush() */
 }
 
-void ip_rt_update(int event, struct device *dev)
+void ip_rt_init(void)	/* Routing initialisation: runs ip_fib_init() and, when CONFIG_PROC_FS is set, registers the "rt_cache" proc entry served by rt_cache_get_info. Takes no arguments and returns nothing; (void) makes that an explicit prototype. */
 {
-/*
- *	This causes too much grief to do now.
- */
-#ifdef COMING_IN_2_1
-	if (event == NETDEV_UP)
-		rt_add(RTF_HOST|RTF_UP, dev->pa_addr, ~0, 0, dev, 0, 0, 0, 0);
-	else if (event == NETDEV_DOWN)
-		rt_del(dev->pa_addr, ~0, dev, 0, RTF_HOST|RTF_UP, 0);
-#endif		
+	ip_fib_init();
+
+#ifdef CONFIG_PROC_FS
+	proc_net_register(&(struct proc_dir_entry) {	/* NOTE(review): the entry is a compound literal whose address is handed to proc_net_register(); ISO C99 gives block-scope compound literals automatic storage -- confirm the compiler in use keeps it alive beyond this call */
+		PROC_NET_RTCACHE, 8, "rt_cache",	/* 8 matches the length of "rt_cache"; presumably the name-length field -- confirm */
+		S_IFREG | S_IRUGO, 1, 0, 0,
+		0, &proc_net_inode_operations,
+		rt_cache_get_info
+	});
+#endif
 }

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov