patch-2.3.6 linux/net/core/dev.c

diff -u --recursive --new-file v2.3.5/linux/net/core/dev.c linux/net/core/dev.c
@@ -263,13 +263,13 @@
 {
 	struct device *dev;
 
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		if (strcmp(dev->name, name) == 0)
 			goto out;
 	}
 out:
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 	return dev;
 }
 
@@ -277,13 +277,13 @@
 {
 	struct device *dev;
 
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		if (dev->ifindex == ifindex)
 			goto out;
 	}
 out:
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 	return dev;
 }
 
@@ -291,14 +291,14 @@
 {
 	struct device *dev;
 
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		if (dev->type == type &&
 		    memcmp(dev->dev_addr, ha, dev->addr_len) == 0)
 			goto out;
 	}
 out:
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 	return dev;
 }
 
@@ -321,7 +321,7 @@
 	}
 	return -ENFILE;	/* Over 100 of the things .. bail out! */
 }
- 
+
 struct device *dev_alloc(const char *name, int *err)
 {
 	struct device *dev=kmalloc(sizeof(struct device)+16, GFP_KERNEL);
@@ -387,9 +387,6 @@
 	if (dev->flags&IFF_UP)
 		return 0;
 
-	/* Setup the lock before we open the faucet. */
-	spin_lock_init(&dev->xmit_lock);
-
 	/*
 	 *	Call device private open method
 	 */
@@ -452,10 +449,10 @@
 	if (dev) {
 		dev_do_clear_fastroute(dev);
 	} else {
-		read_lock_bh(&dev_base_lock);
+		read_lock(&dev_base_lock);
 		for (dev = dev_base; dev; dev = dev->next)
 			dev_do_clear_fastroute(dev);
-		read_unlock_bh(&dev_base_lock);
+		read_unlock(&dev_base_lock);
 	}
 }
 #endif
@@ -596,59 +593,61 @@
 	struct device *dev = skb->dev;
 	struct Qdisc  *q;
 
-#ifdef CONFIG_NET_PROFILE
-	start_bh_atomic();
-	NET_PROFILE_ENTER(dev_queue_xmit);
-#endif
-
-	spin_lock_bh(&dev->xmit_lock);
+	/* Grab the device queue lock. */
+	spin_lock_bh(&dev->queue_lock);
 	q = dev->qdisc;
 	if (q->enqueue) {
 		q->enqueue(skb, q);
-		qdisc_wakeup(dev);
-		spin_unlock_bh(&dev->xmit_lock);
 
-#ifdef CONFIG_NET_PROFILE
-	        NET_PROFILE_LEAVE(dev_queue_xmit);
-		end_bh_atomic();
-#endif
+		/* If the device is not busy, kick it.
+		 * If it is busy, or the queue is still non-empty
+		 * after the kick, add the device to the run list.
+		 */
+		if (dev->tbusy || qdisc_restart(dev))
+			qdisc_run(dev->qdisc);
 
+		spin_unlock_bh(&dev->queue_lock);
 		return 0;
 	}
+	spin_unlock_bh(&dev->queue_lock);
 
 	/* The device has no queue. Common case for software devices:
 	   loopback, all the sorts of tunnels...
 
-	   Really, it is unlikely that bh protection is necessary here:
-	   virtual devices do not generate EOI events.
-	   However, it is possible, that they rely on bh protection
+	   Really, it is unlikely that xmit_lock protection is necessary here.
+	   (E.g., loopback and IP tunnels are clean, ignoring statistics counters.)
+	   However, it is possible that they rely on the protection
 	   made by us here.
+
+	   Check this and remove the lock; it is not prone to deadlocks.
+	   Or remove the noqueue qdisc instead, which is even simpler 8)
 	 */
 	if (dev->flags&IFF_UP) {
 		if (netdev_nit) 
 			dev_queue_xmit_nit(skb,dev);
-		if (dev->hard_start_xmit(skb, dev) == 0) {
-			spin_unlock_bh(&dev->xmit_lock);
-
-#ifdef CONFIG_NET_PROFILE
-			NET_PROFILE_LEAVE(dev_queue_xmit);
-			end_bh_atomic();
-#endif
 
-			return 0;
+		local_bh_disable();
+		if (dev->xmit_lock_owner != smp_processor_id()) {
+			spin_lock(&dev->xmit_lock);
+			dev->xmit_lock_owner = smp_processor_id();
+			if (dev->hard_start_xmit(skb, dev) == 0) {
+				dev->xmit_lock_owner = -1;
+				spin_unlock_bh(&dev->xmit_lock);
+				return 0;
+			}
+			dev->xmit_lock_owner = -1;
+			spin_unlock_bh(&dev->xmit_lock);
+			if (net_ratelimit())
+				printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name);
+		} else {
+			/* Recursion detected! It is possible, unfortunately. */
+			local_bh_enable();
+			if (net_ratelimit())
+				printk(KERN_DEBUG "Dead loop on virtual device %s, fix it urgently!\n", dev->name);
 		}
-		if (net_ratelimit())
-			printk(KERN_DEBUG "Virtual device %s asks to queue packet!\n", dev->name);
 	}
-	spin_unlock_bh(&dev->xmit_lock);
 
 	kfree_skb(skb);
-
-#ifdef CONFIG_NET_PROFILE
-	NET_PROFILE_LEAVE(dev_queue_xmit);
-	end_bh_atomic();
-#endif
-
 	return 0;
 }
 
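
The hunk above replaces the single bh-protected transmit path with a per-device queue_lock plus an xmit_lock whose owning CPU is recorded, so a virtual device that transmits back to itself is caught and dropped instead of deadlocking. Below is a minimal userspace sketch of that owner-tracking pattern, not kernel code: pthread mutexes and thread IDs stand in for dev->xmit_lock and smp_processor_id(), and all names (xmit_state, xmit) are invented for illustration.

#include <pthread.h>
#include <stdio.h>

struct xmit_state {
	pthread_mutex_t lock;	/* stands in for dev->xmit_lock */
	pthread_t owner;	/* stands in for dev->xmit_lock_owner */
	int owned;		/* replaces the -1 "no owner" sentinel */
};

static struct xmit_state st = { .lock = PTHREAD_MUTEX_INITIALIZER };

static int xmit(int depth)
{
	/* Unlocked read, as in the patch: only the owner thread can see
	   its own ID here, so same-thread recursion is detected reliably. */
	if (st.owned && pthread_equal(st.owner, pthread_self())) {
		fprintf(stderr, "dead loop on virtual device, dropping\n");
		return -1;
	}

	pthread_mutex_lock(&st.lock);
	st.owner = pthread_self();
	st.owned = 1;

	printf("hard_start_xmit at depth %d\n", depth);
	if (depth == 0)
		xmit(depth + 1);	/* simulate a device transmitting to itself */

	st.owned = 0;
	pthread_mutex_unlock(&st.lock);
	return 0;
}

int main(void)
{
	return xmit(0);
}
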
@@ -660,9 +659,6 @@
 int netdev_dropping = 0;
 int netdev_max_backlog = 300;
 atomic_t netdev_rx_dropped;
-#ifdef CONFIG_CPU_IS_SLOW
-int net_cpu_congestion;
-#endif
 
 #ifdef CONFIG_NET_HW_FLOWCONTROL
 int netdev_throttle_events;
@@ -852,14 +848,6 @@
 	struct packet_type *pt_prev;
 	unsigned short type;
 	unsigned long start_time = jiffies;
-#ifdef CONFIG_CPU_IS_SLOW
-	static unsigned long start_busy = 0;
-	static unsigned long ave_busy = 0;
-
-	if (start_busy == 0)
-		start_busy = start_time;
-	net_cpu_congestion = ave_busy>>8;
-#endif
 
 	NET_PROFILE_ENTER(net_bh);
 	/*
@@ -869,9 +857,9 @@
 	 *	latency on a transmit interrupt bh.
 	 */
 
-	if (qdisc_head.forw != &qdisc_head)
+	if (qdisc_pending())
 		qdisc_run_queues();
-  
+
 	/*
 	 *	Any data left to process. This may occur because a
 	 *	mark_bh() is done after we empty the queue including
@@ -899,19 +887,6 @@
 		 */
 		skb = skb_dequeue(&backlog);
 
-#ifdef CONFIG_CPU_IS_SLOW
-		if (ave_busy > 128*16) {
-			kfree_skb(skb);
-			while ((skb = skb_dequeue(&backlog)) != NULL)
-				kfree_skb(skb);
-			break;
-		}
-#endif
-
-
-#if 0
-		NET_PROFILE_SKB_PASSED(skb, net_bh_skb);
-#endif
 #ifdef CONFIG_NET_FASTROUTE
 		if (skb->pkt_type == PACKET_FASTROUTE) {
 			dev_queue_xmit(skb);
@@ -1022,16 +997,9 @@
 	 *	One last output flush.
 	 */
 
-	if (qdisc_head.forw != &qdisc_head)
+	if (qdisc_pending())
 		qdisc_run_queues();
 
-#ifdef  CONFIG_CPU_IS_SLOW
-        if (1) {
-		unsigned long start_idle = jiffies;
-		ave_busy += ((start_idle - start_busy)<<3) - (ave_busy>>4);
-		start_busy = 0;
-	}
-#endif
 #ifdef CONFIG_NET_HW_FLOWCONTROL
 	if (netdev_dropping)
 		netdev_wakeup();
@@ -1065,14 +1033,6 @@
  */
 
 /*
- *	This call is useful, but I'd remove it too.
- *
- *	The reason is purely aestetical, it is the only call
- *	from SIOC* family using struct ifreq in reversed manner.
- *	Besides that, it is pretty silly to put "drawing" facility
- *	to kernel, it is useful only to print ifindices
- *	in readable form, is not it? --ANK
- *
  *	We need this ioctl for efficient implementation of the
  *	if_indextoname() function required by the IPv6 API.  Without
  *	it, we would have to search all the interfaces to find a
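
As background for the comment retained above: if_indextoname() is the standard C library call, declared in <net/if.h>, that this ioctl exists to serve. A minimal userspace use looks like this (illustrative only; the index defaults to 1 if no argument is given):

#include <net/if.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char name[IF_NAMESIZE];
	unsigned int idx = (argc > 1) ? (unsigned int)atoi(argv[1]) : 1;

	/* Maps an interface index back to its name; without the ioctl,
	   the library would have to scan every interface to do this. */
	if (if_indextoname(idx, name) == NULL) {
		perror("if_indextoname");
		return 1;
	}
	printf("index %u -> %s\n", idx, name);
	return 0;
}
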
@@ -1138,7 +1098,7 @@
 	 */
 
 	total = 0;
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		for (i=0; i<NPROTO; i++) {
 			if (gifconf_list[i]) {
@@ -1152,7 +1112,7 @@
 			}
 		}
   	}
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 
 	if(pos != NULL) {
 		int err = copy_to_user(ifc.ifc_buf, pos, total);
@@ -1232,7 +1192,7 @@
 	len+=size;
 	
 
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev != NULL; dev = dev->next) {
 		size = sprintf_stats(buffer+len, dev);
 		len+=size;
@@ -1245,7 +1205,7 @@
 		if(pos>offset+length)
 			break;
 	}
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 	
 	*start=buffer+(offset-begin);	/* Start of wanted data */
 	len-=(offset-begin);		/* Start slop */
@@ -1347,7 +1307,7 @@
 	pos+=size;
 	len+=size;
 
-	read_lock_bh(&dev_base_lock);
+	read_lock(&dev_base_lock);
 	for(dev = dev_base; dev != NULL; dev = dev->next) {
 		size = sprintf_wireless_stats(buffer+len, dev);
 		len+=size;
@@ -1360,7 +1320,7 @@
 		if(pos > offset + length)
 			break;
 	}
-	read_unlock_bh(&dev_base_lock);
+	read_unlock(&dev_base_lock);
 
 	*start = buffer + (offset - begin);	/* Start of wanted data */
 	len -= (offset - begin);		/* Start slop */
@@ -1736,11 +1696,10 @@
 				if (IW_IS_SET(cmd)) {
 					if (!suser())
 						return -EPERM;
-					rtnl_lock();
 				}
+				rtnl_lock();
 				ret = dev_ifsioc(&ifr, cmd);
-				if (IW_IS_SET(cmd))
-					rtnl_unlock();
+				rtnl_unlock();
 				if (!ret && IW_IS_GET(cmd) &&
 				    copy_to_user(arg, &ifr, sizeof(struct ifreq)))
 					return -EFAULT;
@@ -1769,6 +1728,10 @@
 {
 	struct device *d, **dp;
 
+	spin_lock_init(&dev->queue_lock);
+	spin_lock_init(&dev->xmit_lock);
+	dev->xmit_lock_owner = -1;
+
 	if (dev_boot_phase) {
 		/* This is NOT bug, but I am not sure, that all the
 		   devices, initialized before netdev module is started
@@ -1784,14 +1747,13 @@
 		printk(KERN_INFO "early initialization of device %s is deferred\n", dev->name);
 
 		/* Check for existence, and append to tail of chain */
-		write_lock_bh(&dev_base_lock);
 		for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
 			if (d == dev || strcmp(d->name, dev->name) == 0) {
-				write_unlock_bh(&dev_base_lock);
 				return -EEXIST;
 			}
 		}
 		dev->next = NULL;
+		write_lock_bh(&dev_base_lock);
 		*dp = dev;
 		write_unlock_bh(&dev_base_lock);
 		return 0;
@@ -1803,24 +1765,22 @@
 	if (dev->init && dev->init(dev) != 0)
 		return -EIO;
 
+	dev->ifindex = dev_new_index();
+	if (dev->iflink == -1)
+		dev->iflink = dev->ifindex;
+
 	/* Check for existence, and append to tail of chain */
-	write_lock_bh(&dev_base_lock);
 	for (dp=&dev_base; (d=*dp) != NULL; dp=&d->next) {
 		if (d == dev || strcmp(d->name, dev->name) == 0) {
-			write_unlock_bh(&dev_base_lock);
 			return -EEXIST;
 		}
 	}
 	dev->next = NULL;
 	dev_init_scheduler(dev);
+	write_lock_bh(&dev_base_lock);
 	*dp = dev;
 	write_unlock_bh(&dev_base_lock);
 
-	dev->ifindex = -1;
-	dev->ifindex = dev_new_index();
-	if (dev->iflink == -1)
-		dev->iflink = dev->ifindex;
-
 	/* Notify protocols, that a new device appeared. */
 	notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
 
@@ -1831,15 +1791,35 @@
 {
 	struct device *d, **dp;
 
-	if (dev_boot_phase == 0) {
-		/* If device is running, close it.
-		   It is very bad idea, really we should
-		   complain loudly here, but random hackery
-		   in linux/drivers/net likes it.
-		 */
-		if (dev->flags & IFF_UP)
-			dev_close(dev);
+	/* If device is running, close it first. */
+	if (dev->flags & IFF_UP)
+		dev_close(dev);
 
+	/* And unlink it from device chain. */
+	for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
+		if (d == dev) {
+			write_lock_bh(&dev_base_lock);
+			*dp = d->next;
+			write_unlock_bh(&dev_base_lock);
+
+			/* Sorry, this is a known "feature": the race is real.
+			   It can be fixed only once device reference
+			   counting is complete.
+			 */
+			synchronize_bh();
+			break;
+		}
+	}
+	if (d == NULL)
+		return -ENODEV;
+
+	/* This is a "synchronize_bh" for those who overslept
+	   in skb_alloc/page fault etc. while the device went off-line.
+	   Again, it can be removed only once devices are refcounted.
+	 */
+	dev_lock_wait();
+
+	if (dev_boot_phase == 0) {
 #ifdef CONFIG_NET_FASTROUTE
 		dev_clear_fastroute(dev);
 #endif
@@ -1856,27 +1836,11 @@
 		 *	Flush the multicast chain
 		 */
 		dev_mc_discard(dev);
-
-		/* To avoid pointers looking to nowhere,
-		   we wait for end of critical section */
-		dev_lock_wait();
 	}
 
-	/* And unlink it from device chain. */
-	write_lock_bh(&dev_base_lock);
-	for (dp = &dev_base; (d=*dp) != NULL; dp=&d->next) {
-		if (d == dev) {
-			*dp = d->next;
-			d->next = NULL;
-			write_unlock_bh(&dev_base_lock);
-
-			if (dev->destructor)
-				dev->destructor(dev);
-			return 0;
-		}
-	}
-	write_unlock_bh(&dev_base_lock);
-	return -ENODEV;
+	if (dev->destructor)
+		dev->destructor(dev);
+	return 0;
 }
 
 
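
The rewritten unregister path above follows the classic unlink, drain, free ordering: take the writer lock only around the pointer swap, wait for stragglers, and free last. Here is a small userspace sketch of the same ordering, under assumptions noted in the comments: a pthread rwlock stands in for dev_base_lock, a write-lock/unlock pair approximates synchronize_bh()/dev_lock_wait(), and every name is invented for illustration.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct dev {
	char name[16];
	struct dev *next;
};

static struct dev *dev_base;
static pthread_rwlock_t base_lock = PTHREAD_RWLOCK_INITIALIZER;

static int unregister_dev(struct dev *dev)
{
	struct dev *d, **dp;

	/* The walk itself is assumed serialized by a higher-level lock
	   (rtnl in the kernel); only the unlink excludes readers. */
	for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
		if (d == dev) {
			pthread_rwlock_wrlock(&base_lock);
			*dp = d->next;		/* 1. unlink */
			pthread_rwlock_unlock(&base_lock);
			break;
		}
	}
	if (d == NULL)
		return -1;

	/* 2. Drain: once the write lock is ours, no reader still holds
	   the read lock, so nobody is mid-walk over the old pointer. */
	pthread_rwlock_wrlock(&base_lock);
	pthread_rwlock_unlock(&base_lock);

	free(d);			/* 3. only now release it */
	return 0;
}

int main(void)
{
	struct dev *d = calloc(1, sizeof(*d));

	if (d == NULL)
		return 1;
	strcpy(d->name, "dummy0");
	d->next = dev_base;
	dev_base = d;
	return unregister_dev(d);
}
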
@@ -2018,16 +1982,24 @@
 	 *	If the call to dev->init fails, the dev is removed
 	 *	from the chain disconnecting the device until the
 	 *	next reboot.
+	 *
+	 *	NB: at boot time networking is dead, so no locking is
+	 *	required. We still take dev_base_lock for sanity.
 	 */
 
 	dp = &dev_base;
 	while ((dev = *dp) != NULL) {
+		spin_lock_init(&dev->queue_lock);
+		spin_lock_init(&dev->xmit_lock);
+		dev->xmit_lock_owner = -1;
 		dev->iflink = -1;
 		if (dev->init && dev->init(dev)) {
 			/*
 			 *	It failed to come up. Unhook it.
 			 */
+			write_lock_bh(&dev_base_lock);
 			*dp = dev->next;
+			write_unlock_bh(&dev_base_lock);
 		} else {
 			dp = &dev->next;
 			dev->ifindex = dev_new_index();
@@ -2055,6 +2027,7 @@
 
 	dev_boot_phase = 0;
 
+	dst_init();
 	dev_mcast_init();
 
 #ifdef CONFIG_IP_PNP
