patch-2.4.9 linux/drivers/md/raid1.c


diff -u --recursive --new-file v2.4.8/linux/drivers/md/raid1.c linux/drivers/md/raid1.c
@@ -33,6 +33,9 @@
 
 #define MAX_WORK_PER_DISK 128
 
+#define	NR_RESERVED_BUFS	32
+
+
 /*
  * The following can be used to debug the driver
  */
@@ -62,7 +65,7 @@
 	while(cnt) {
 		struct buffer_head *t;
 		md_spin_lock_irq(&conf->device_lock);
-		if (conf->freebh_cnt >= cnt)
+		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
 			while (cnt) {
 				t = conf->freebh;
 				conf->freebh = t->b_next;
@@ -75,15 +78,18 @@
 		md_spin_unlock_irq(&conf->device_lock);
 		if (cnt == 0)
 			break;
-		t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_NOIO);
+		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
 		if (t) {
-			memset(t, 0, sizeof(*t));
 			t->b_next = bh;
 			bh = t;
 			cnt--;
 		} else {
 			PRINTK("raid1: waiting for %d bh\n", cnt);
-			wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
+			conf->freebh_blocked = 1;
+			wait_disk_event(conf->wait_buffer,
+					!conf->freebh_blocked ||
+					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
+			conf->freebh_blocked = 0;
 		}
 	}
 	return bh;
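
This hunk, together with the NR_RESERVED_BUFS define above and the raid1_bh hunks below, implements one anti-deadlock scheme: an allocator that cannot be satisfied from the reserve sets a *_blocked flag and sleeps in wait_disk_event() (essentially wait_event() plus a kick of the queued disk I/O, so the writes that will return buffers can still complete) until the pool has refilled past the halfway mark; the flag makes other allocators queue up behind it instead of draining the pool one buffer at a time. Below is a minimal userspace model of the pattern, with POSIX threads standing in for the spinlock and wait queue, the slab fallback omitted, and one buffer handed out per call; pool_get, pool_put and struct node are invented names, not driver code.

	#include <pthread.h>

	#define NR_RESERVED 32			/* stands in for NR_RESERVED_BUFS */

	struct node { struct node *next; };

	struct pool {
		pthread_mutex_t lock;		/* conf->device_lock */
		pthread_cond_t wait;		/* conf->wait_buffer */
		struct node *free_list;		/* conf->freebh */
		int cnt;			/* conf->freebh_cnt */
		int blocked;			/* conf->freebh_blocked */
	};

	static struct node *pool_get(struct pool *p)
	{
		struct node *n = NULL;

		pthread_mutex_lock(&p->lock);
		for (;;) {
			/* fast path: pool usable and nobody queued ahead of us */
			if (!p->blocked && p->free_list) {
				n = p->free_list;
				p->free_list = n->next;
				p->cnt--;
				break;
			}
			/* slow path: close the pool and sleep until it refills
			 * past the halfway mark, as in wait_disk_event(...,
			 * !freebh_blocked || freebh_cnt > reserve/2) */
			p->blocked = 1;
			while (p->blocked && p->cnt <= NR_RESERVED / 2)
				pthread_cond_wait(&p->wait, &p->lock);
			p->blocked = 0;
		}
		pthread_mutex_unlock(&p->lock);
		return n;
	}

	static void pool_put(struct pool *p, struct node *n)
	{
		pthread_mutex_lock(&p->lock);
		n->next = p->free_list;
		p->free_list = n;
		p->cnt++;
		/* cheap wakeup on every free; the sleeper re-checks its
		 * half-full threshold before proceeding */
		pthread_cond_broadcast(&p->wait);
		pthread_mutex_unlock(&p->lock);
	}

The half-full threshold means a blocked writer wakes up to a burst of buffers rather than contending for each one as it is freed.
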
@@ -97,7 +103,7 @@
 		struct buffer_head *t = bh;
 		bh=bh->b_next;
 		if (t->b_pprev == NULL)
-			kfree(t);
+			kmem_cache_free(bh_cachep, t);
 		else {
 			t->b_next= conf->freebh;
 			conf->freebh = t;
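
Buffer heads now come from bh_cachep, the slab cache the buffer layer keeps for struct buffer_head, instead of kmalloc()/kfree(), and the matching memset() calls disappear, presumably because the driver initialises every field it uses before issuing I/O. For reference, the 2.4 slab calls relied on here, wrapped around a hypothetical cache; my_cachep and struct my_object are illustration names, and bh_cachep itself is created in fs/buffer.c:

	#include <linux/slab.h>
	#include <linux/errno.h>

	struct my_object { int x; };		/* made-up payload */

	static kmem_cache_t *my_cachep;		/* hypothetical cache */

	static int my_pool_init(void)
	{
		my_cachep = kmem_cache_create("my_objects",
					      sizeof(struct my_object), 0,
					      SLAB_HWCACHE_ALIGN, NULL, NULL);
		return my_cachep ? 0 : -ENOMEM;
	}

	static struct my_object *my_alloc_on_write_path(void)
	{
		return kmem_cache_alloc(my_cachep, SLAB_NOIO);
	}

	static void my_free(struct my_object *o)
	{
		kmem_cache_free(my_cachep, o);
	}

SLAB_NOIO corresponds to GFP_NOIO: the allocation may not recurse into the I/O path, which matters when allocating on a block driver's write path.
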
@@ -110,14 +116,13 @@
 
 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
 {
-	/* allocate cnt buffer_heads, possibly less if kalloc fails */
+	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
 	int i = 0;
 
 	while (i < cnt) {
 		struct buffer_head *bh;
-		bh = kmalloc(sizeof(*bh), GFP_KERNEL);
+		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
 		if (!bh) break;
-		memset(bh, 0, sizeof(*bh));
 
 		md_spin_lock_irq(&conf->device_lock);
 		bh->b_pprev = &conf->freebh;
@@ -131,21 +136,18 @@
 	return i;
 }
 
-static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
+static void raid1_shrink_bh(raid1_conf_t *conf)
 {
-	/* discard cnt buffer_heads, if we can find them */
-	int i = 0;
+	/* discard all buffer_heads */
 
 	md_spin_lock_irq(&conf->device_lock);
-	while ((i < cnt) && conf->freebh) {
+	while (conf->freebh) {
 		struct buffer_head *bh = conf->freebh;
 		conf->freebh = bh->b_next;
-		kfree(bh);
-		i++;
+		kmem_cache_free(bh_cachep, bh);
 		conf->freebh_cnt--;
 	}
 	md_spin_unlock_irq(&conf->device_lock);
-	return i;
 }
 		
 
@@ -155,9 +157,10 @@
 
 	do {
 		md_spin_lock_irq(&conf->device_lock);
-		if (conf->freer1) {
+		if (!conf->freer1_blocked && conf->freer1) {
 			r1_bh = conf->freer1;
 			conf->freer1 = r1_bh->next_r1;
+			conf->freer1_cnt--;
 			r1_bh->next_r1 = NULL;
 			r1_bh->state = 0;
 			r1_bh->bh_req.b_state = 0;
@@ -170,7 +173,12 @@
 			memset(r1_bh, 0, sizeof(*r1_bh));
 			return r1_bh;
 		}
-		wait_event(conf->wait_buffer, conf->freer1);
+		conf->freer1_blocked = 1;
+		wait_disk_event(conf->wait_buffer,
+				!conf->freer1_blocked ||
+				conf->freer1_cnt > NR_RESERVED_BUFS/2
+			);
+		conf->freer1_blocked = 0;
 	} while (1);
 }
 
@@ -186,7 +194,11 @@
 		spin_lock_irqsave(&conf->device_lock, flags);
 		r1_bh->next_r1 = conf->freer1;
 		conf->freer1 = r1_bh;
+		conf->freer1_cnt++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
+		/* don't need to wakeup wait_buffer because
+		 *  raid1_free_bh below will do that
+		 */
 	} else {
 		kfree(r1_bh);
 	}
@@ -203,13 +215,10 @@
 		if (!r1_bh)
 			break;
 		memset(r1_bh, 0, sizeof(*r1_bh));
-
-		md_spin_lock_irq(&conf->device_lock);
 		set_bit(R1BH_PreAlloc, &r1_bh->state);
-		r1_bh->next_r1 = conf->freer1;
-		conf->freer1 = r1_bh;
-		md_spin_unlock_irq(&conf->device_lock);
+		r1_bh->mddev = conf->mddev;
 
+		raid1_free_r1bh(r1_bh);
 		i++;
 	}
 	return i;
@@ -221,6 +230,7 @@
 	while (conf->freer1) {
 		struct raid1_bh *r1_bh = conf->freer1;
 		conf->freer1 = r1_bh->next_r1;
+		conf->freer1_cnt--;
 		kfree(r1_bh);
 	}
 	md_spin_unlock_irq(&conf->device_lock);
@@ -551,7 +561,7 @@
 	struct buffer_head *bh_req, *bhl;
 	struct raid1_bh * r1_bh;
 	int disks = MD_SB_DISKS;
-	int i, sum_bhs = 0, sectors;
+	int i, sum_bhs = 0;
 	struct mirror_info *mirror;
 
 	if (!buffer_locked(bh))
@@ -592,7 +602,6 @@
 	r1_bh->mddev = mddev;
 	r1_bh->cmd = rw;
 
-	sectors = bh->b_size >> 9;
 	if (rw == READ) {
 		/*
 		 * read balancing logic:
@@ -665,6 +674,11 @@
 		sum_bhs++;
 	}
 	if (bhl) raid1_free_bh(conf,bhl);
+	if (!sum_bhs) {
+		/* Gag - all mirrors non-operational.. */
+		raid1_end_bh_io(r1_bh, 0);
+		return 0;
+	}
 	md_atomic_set(&r1_bh->remaining, sum_bhs);
 
 	/*
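
Previously, if every mirror had already failed, the loop above queued nothing, remaining was set to zero, and the original buffer head was never completed, so the caller hung. The new !sum_bhs branch fails the request immediately; the resync write path in the @@ -1185 hunk further down gets the same guard. A userspace sketch of the rule, with hypothetical stand-in names (struct request, end_request, submit_mirrored_write) for raid1's structures:

	#include <stdio.h>

	struct request {
		int remaining;		/* md_atomic_set(&r1_bh->remaining, ...) */
		int uptodate;
	};

	static void end_request(struct request *r, int uptodate)
	{
		/* stands in for raid1_end_bh_io(r1_bh, uptodate) */
		r->uptodate = uptodate;
		printf("request completed, uptodate=%d\n", uptodate);
	}

	static void submit_mirrored_write(struct request *r,
					  const int *operational, int disks)
	{
		int i, sum_bhs = 0;

		for (i = 0; i < disks; i++)
			if (operational[i])
				sum_bhs++;	/* one mirror bh queued */

		if (!sum_bhs) {			/* all mirrors non-operational */
			end_request(r, 0);	/* fail now, don't hang the caller */
			return;
		}
		r->remaining = sum_bhs;
		/* ... generic_make_request() for each queued bh follows ... */
	}

	int main(void)
	{
		struct request r = { 0, 1 };
		int dead[2] = { 0, 0 };

		submit_mirrored_write(&r, dead, 2);
		return 0;
	}
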
@@ -727,12 +741,14 @@
 	mark_disk_faulty(sb->disks+mirror->number);
 	mark_disk_nonsync(sb->disks+mirror->number);
 	mark_disk_inactive(sb->disks+mirror->number);
-	sb->active_disks--;
+	if (!mirror->write_only)
+		sb->active_disks--;
 	sb->working_disks--;
 	sb->failed_disks++;
 	mddev->sb_dirty = 1;
 	md_wakeup_thread(conf->thread);
-	conf->working_disks--;
+	if (!mirror->write_only)
+		conf->working_disks--;
 	printk (DISK_FAILED, partition_name (mirror->dev),
 				 conf->working_disks);
 }
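
A spare that is still being resynced onto is attached write-only and was never counted in sb->active_disks or conf->working_disks, so failing it must not decrement them; sb->working_disks, by contrast, still drops unconditionally. A toy model of the corrected bookkeeping (names simplified, failed_disks omitted):

	#include <stdio.h>

	struct disk_state {
		int operational;
		int write_only;		/* spare still being resynced onto */
	};

	static void mark_disk_bad_model(struct disk_state *d, int *sb_active,
					int *sb_working, int *conf_working)
	{
		d->operational = 0;
		if (!d->write_only)
			(*sb_active)--;		/* spares were never active */
		(*sb_working)--;		/* one fewer working disk regardless */
		if (!d->write_only)
			(*conf_working)--;	/* raid1's count of readable mirrors */
	}

	int main(void)
	{
		/* degraded two-disk mirror rebuilding onto a spare */
		struct disk_state spare = { 1, 1 };
		int sb_active = 1, sb_working = 2, conf_working = 1;

		mark_disk_bad_model(&spare, &sb_active, &sb_working, &conf_working);
		/* without the write_only checks both active counts would hit 0
		 * and the array would look dead despite a healthy data disk */
		printf("active=%d working=%d conf_working=%d\n",
		       sb_active, sb_working, conf_working);
		return 0;
	}
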
@@ -744,28 +760,27 @@
 	int disks = MD_SB_DISKS;
 	int i;
 
-	if (conf->working_disks == 1) {
-		/*
-		 * Uh oh, we can do nothing if this is our last disk, but
-		 * first check if this is a queued request for a device
-		 * which has just failed.
-		 */
-		for (i = 0; i < disks; i++) {
-			if (mirrors[i].dev==dev && !mirrors[i].operational)
-				return 0;
-		}
-		printk (LAST_DISK);
-	} else {
-		/*
-		 * Mark disk as unusable
+	/* Find the drive.
+	 * If it is not operational, then we have already marked it as dead
+	 * else if it is the last working disk, ignore the error, let the
+	 * next level up know.
+	 * else mark the drive as failed
+	 */
+
+	for (i = 0; i < disks; i++)
+		if (mirrors[i].dev==dev && mirrors[i].operational)
+			break;
+	if (i == disks)
+		return 0;
+
+	if (i < conf->raid_disks && conf->working_disks == 1) {
+		/* Don't fail the drive, act as though we were just a
+		 * normal single drive
 		 */
-		for (i = 0; i < disks; i++) {
-			if (mirrors[i].dev==dev && mirrors[i].operational) {
-				mark_disk_bad(mddev, i);
-				break;
-			}
-		}
+
+		return 1;
 	}
+	mark_disk_bad(mddev, i);
 	return 0;
 }
 
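The rewritten raid1_error() folds the two old branches into one walk over the disk table, and the last-disk special case now applies only to a drive that actually holds live data: slots at index raid_disks and above are spares being rebuilt, and failing one of those is never suppressed. A condensed userspace model of the new flow, with simplified fields; per the comment in the hunk, returning 1 means keep the drive and let the error propagate as if this were a plain single disk:

	struct mirror {
		int dev;
		int operational;
	};

	static int raid1_error_model(struct mirror *mirrors, int disks,
				     int raid_disks, int working_disks, int dev)
	{
		int i;

		/* find the drive; no operational match means it was already
		 * marked dead and the queued error needs no further action */
		for (i = 0; i < disks; i++)
			if (mirrors[i].dev == dev && mirrors[i].operational)
				break;
		if (i == disks)
			return 0;

		/* last working data disk (slots >= raid_disks are spares):
		 * don't fail it, behave like a normal single drive */
		if (i < raid_disks && working_disks == 1)
			return 1;

		mirrors[i].operational = 0;	/* stands in for mark_disk_bad() */
		return 0;
	}

	int main(void)
	{
		struct mirror m[2] = { { 3, 1 }, { 4, 0 } };

		/* dev 3 is the last working data disk: error is passed up */
		return raid1_error_model(m, 2, 2, 1, 3) == 1 ? 0 : 1;
	}
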
@@ -1185,6 +1200,15 @@
 				md_atomic_set(&r1_bh->remaining, sum_bhs);
 				if (bhl) raid1_free_bh(conf, bhl);
 				mbh = r1_bh->mirror_bh_list;
+
+				if (!sum_bhs) {
+					/* nowhere to write this to... I guess we
+					 * must be done
+					 */
+					sync_request_done(bh->b_blocknr, conf);
+					md_done_sync(mddev, bh->b_size>>9, 0);
+					raid1_free_buf(r1_bh);
+				} else
 				while (mbh) {
 					struct buffer_head *bh1 = mbh;
 					mbh = mbh->b_next;
@@ -1192,18 +1216,12 @@
 					md_sync_acct(bh1->b_dev, bh1->b_size/512);
 				}
 			} else {
-				dev = bh->b_dev;
-				raid1_map (mddev, &bh->b_dev);
-				if (bh->b_dev == dev) {
-					printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
-					md_done_sync(mddev, bh->b_size>>9, 0);
-				} else {
-					printk (REDIRECT_SECTOR,
-						partition_name(bh->b_dev), bh->b_blocknr);
-					bh->b_rdev = bh->b_dev;
-					bh->b_rsector = bh->b_blocknr; 
-					generic_make_request(READ, bh);
-				}
+				/* There is no point trying a read-for-reconstruct
+				 * as reconstruct is about to be aborted
+				 */
+
+				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+				md_done_sync(mddev, bh->b_size>>9, 0);
 			}
 
 			break;
@@ -1615,12 +1633,14 @@
 	 * As a minimum, 1 r1bh and raid_disks buffer_heads
 	 * would probably get us by in tight memory situations,
 	 * but a few more is probably a good idea.
-	 * For now, try 16 r1bh and 16*raid_disks bufferheads
-	 * This will allow at least 16 concurrent reads or writes
-	 * even if kmalloc starts failing
-	 */
-	if (raid1_grow_r1bh(conf, 16) < 16 ||
-	    raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) {
+	 * For now, try NR_RESERVED_BUFS r1bh and
+	 * NR_RESERVED_BUFS*raid_disks bufferheads
+	 * This will allow at least NR_RESERVED_BUFS concurrent
+	 * reads or writes even if kmalloc starts failing
+	 */
+	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
+	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
+	                      < NR_RESERVED_BUFS*conf->raid_disks) {
 		printk(MEM_ERROR, mdidx(mddev));
 		goto out_free_conf;
 	}
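
With NR_RESERVED_BUFS at 32, the reserve doubles relative to the old hard-coded 16. A quick check of what initialisation now preallocates per array (plain arithmetic, nothing driver-specific):

	#include <stdio.h>

	#define NR_RESERVED_BUFS 32

	int main(void)
	{
		int raid_disks;

		for (raid_disks = 2; raid_disks <= 4; raid_disks++)
			printf("raid_disks=%d: %d r1_bh, %d buffer_heads\n",
			       raid_disks, NR_RESERVED_BUFS,
			       NR_RESERVED_BUFS * raid_disks);
		return 0;
	}

For a two-disk mirror that is 32 r1_bh plus 64 buffer heads, enough for 32 requests in flight even if kmalloc starts failing, as the comment above says.
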
@@ -1711,7 +1731,7 @@
 
 out_free_conf:
 	raid1_shrink_r1bh(conf);
-	raid1_shrink_bh(conf, conf->freebh_cnt);
+	raid1_shrink_bh(conf);
 	raid1_shrink_buffers(conf);
 	kfree(conf);
 	mddev->private = NULL;
@@ -1772,7 +1792,7 @@
 	if (conf->resync_thread)
 		md_unregister_thread(conf->resync_thread);
 	raid1_shrink_r1bh(conf);
-	raid1_shrink_bh(conf, conf->freebh_cnt);
+	raid1_shrink_bh(conf);
 	raid1_shrink_buffers(conf);
 	kfree(conf);
 	mddev->private = NULL;
