patch-2.4.6 linux/fs/buffer.c

diff -u --recursive --new-file v2.4.5/linux/fs/buffer.c linux/fs/buffer.c
@@ -61,7 +61,7 @@
 
 #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9])
 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
-#define NR_RESERVED (2*MAX_BUF_PER_PAGE)
+#define NR_RESERVED (10*MAX_BUF_PER_PAGE)
 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this 
 					     number of unused buffer heads */
 
@@ -161,144 +161,131 @@
 	atomic_dec(&bh->b_count);
 }
 
-/* Call sync_buffers with wait!=0 to ensure that the call does not
- * return until all buffer writes have completed.  Sync() may return
- * before the writes have finished; fsync() may not.
- */
+/* End-of-write handler.. Just mark it up-to-date and unlock the buffer. */
+static void end_buffer_write(struct buffer_head *bh, int uptodate)
+{
+	mark_buffer_uptodate(bh, uptodate);
+	unlock_buffer(bh);
+}
 
-/* Godamity-damn.  Some buffers (bitmaps for filesystems)
- * spontaneously dirty themselves without ever brelse being called.
- * We will ultimately want to put these in a separate list, but for
- * now we search all of the lists for dirty buffers.
+/*
+ * The buffers have been marked clean and locked.  Just submit the dang
+ * things.. 
+ *
+ * We'll wait for the first one of them - "sync" is not exactly
+ * performance-critical, and this makes us not hog the IO subsystem
+ * completely, while still allowing for a fair amount of concurrent IO. 
  */
-static int sync_buffers(kdev_t dev, int wait)
+static void write_locked_buffers(struct buffer_head **array, unsigned int count)
 {
-	int i, retry, pass = 0, err = 0;
-	struct buffer_head * bh, *next;
-
-	/* One pass for no-wait, three for wait:
-	 * 0) write out all dirty, unlocked buffers;
-	 * 1) write out all dirty buffers, waiting if locked;
-	 * 2) wait for completion by waiting for all buffers to unlock.
-	 */
+	struct buffer_head *wait = *array;
+	atomic_inc(&wait->b_count);
 	do {
-		retry = 0;
-
-		/* We search all lists as a failsafe mechanism, not because we expect
-		 * there to be dirty buffers on any of the other lists.
-		 */
-repeat:
-		spin_lock(&lru_list_lock);
-		bh = lru_list[BUF_DIRTY];
-		if (!bh)
-			goto repeat2;
-
-		for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) {
-			next = bh->b_next_free;
+		struct buffer_head * bh = *array++;
+		bh->b_end_io = end_buffer_write;
+		submit_bh(WRITE, bh);
+	} while (--count);
+	wait_on_buffer(wait);
+	atomic_dec(&wait->b_count);
+}
 
-			if (!lru_list[BUF_DIRTY])
-				break;
-			if (dev && bh->b_dev != dev)
-				continue;
-			if (buffer_locked(bh)) {
-				/* Buffer is locked; skip it unless wait is
-				 * requested AND pass > 0.
-				 */
-				if (!wait || !pass) {
-					retry = 1;
-					continue;
-				}
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				wait_on_buffer (bh);
-				atomic_dec(&bh->b_count);
-				goto repeat;
-			}
+#define NRSYNC (32)
+static void write_unlocked_buffers(kdev_t dev)
+{
+	struct buffer_head *next;
+	struct buffer_head *array[NRSYNC];
+	unsigned int count;
+	int nr;
 
-			/* If an unlocked buffer is not uptodate, there has
-			 * been an IO error. Skip it.
-			 */
-			if (wait && buffer_req(bh) && !buffer_locked(bh) &&
-			    !buffer_dirty(bh) && !buffer_uptodate(bh)) {
-				err = -EIO;
-				continue;
-			}
+repeat:
+	spin_lock(&lru_list_lock);
+	next = lru_list[BUF_DIRTY];
+	nr = nr_buffers_type[BUF_DIRTY] * 2;
+	count = 0;
+	while (next && --nr >= 0) {
+		struct buffer_head * bh = next;
+		next = bh->b_next_free;
 
-			/* Don't write clean buffers.  Don't write ANY buffers
-			 * on the third pass.
-			 */
-			if (!buffer_dirty(bh) || pass >= 2)
+		if (dev && bh->b_dev != dev)
+			continue;
+		if (test_and_set_bit(BH_Lock, &bh->b_state))
+			continue;
+		if (atomic_set_buffer_clean(bh)) {
+			__refile_buffer(bh);
+			array[count++] = bh;
+			if (count < NRSYNC)
 				continue;
 
-			atomic_inc(&bh->b_count);
 			spin_unlock(&lru_list_lock);
-			ll_rw_block(WRITE, 1, &bh);
-			atomic_dec(&bh->b_count);
-			retry = 1;
+			write_locked_buffers(array, count);
 			goto repeat;
 		}
+		unlock_buffer(bh);
+	}
+	spin_unlock(&lru_list_lock);
 
-    repeat2:
-		bh = lru_list[BUF_LOCKED];
-		if (!bh) {
-			spin_unlock(&lru_list_lock);
-			break;
-		}
-		for (i = nr_buffers_type[BUF_LOCKED]*2 ; i-- > 0 ; bh = next) {
-			next = bh->b_next_free;
+	if (count)
+		write_locked_buffers(array, count);
+}
 
-			if (!lru_list[BUF_LOCKED])
-				break;
-			if (dev && bh->b_dev != dev)
-				continue;
-			if (buffer_locked(bh)) {
-				/* Buffer is locked; skip it unless wait is
-				 * requested AND pass > 0.
-				 */
-				if (!wait || !pass) {
-					retry = 1;
-					continue;
-				}
-				atomic_inc(&bh->b_count);
-				spin_unlock(&lru_list_lock);
-				wait_on_buffer (bh);
-				spin_lock(&lru_list_lock);
-				atomic_dec(&bh->b_count);
-				goto repeat2;
-			}
+static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
+{
+	struct buffer_head * next;
+	int nr;
+
+repeat:
+	spin_lock(&lru_list_lock);
+	next = lru_list[index];
+	nr = nr_buffers_type[index] * 2;
+	while (next && --nr >= 0) {
+		struct buffer_head *bh = next;
+		next = bh->b_next_free;
+
+		if (!buffer_locked(bh)) {
+			if (refile)
+				__refile_buffer(bh);
+			continue;
 		}
-		spin_unlock(&lru_list_lock);
+		if (dev && bh->b_dev != dev)
+			continue;
 
-		/* If we are waiting for the sync to succeed, and if any dirty
-		 * blocks were written, then repeat; on the second pass, only
-		 * wait for buffers being written (do not pass to write any
-		 * more buffers on the second pass).
-		 */
-	} while (wait && retry && ++pass<=2);
-	return err;
+		atomic_inc(&bh->b_count);
+		spin_unlock(&lru_list_lock);
+		wait_on_buffer (bh);
+		atomic_dec(&bh->b_count);
+		goto repeat;
+	}
+	spin_unlock(&lru_list_lock);
+	return 0;
 }
 
-void sync_dev(kdev_t dev)
+/* Call sync_buffers with wait!=0 to ensure that the call does not
+ * return until all buffer writes have completed.  Sync() may return
+ * before the writes have finished; fsync() may not.
+ */
+
+/* Godamity-damn.  Some buffers (bitmaps for filesystems)
+ * spontaneously dirty themselves without ever brelse being called.
+ * We will ultimately want to put these in a separate list, but for
+ * now we search all of the lists for dirty buffers.
+ */
+static int sync_buffers(kdev_t dev, int wait)
 {
-	sync_supers(dev);
-	sync_inodes(dev);
-	DQUOT_SYNC(dev);
-	/* sync all the dirty buffers out to disk only _after_ all the
-	   high level layers finished generated buffer dirty data
-	   (or we'll return with some buffer still dirty on the blockdevice
-	   so breaking the semantics of this call) */
-	sync_buffers(dev, 0);
-	/*
-	 * FIXME(eric) we need to sync the physical devices here.
-	 * This is because some (scsi) controllers have huge amounts of
-	 * cache onboard (hundreds of Mb), and we need to instruct
-	 * them to commit all of the dirty memory to disk, and we should
-	 * not return until this has happened.
-	 *
-	 * This would need to get implemented by going through the assorted
-	 * layers so that each block major number can be synced, and this
-	 * would call down into the upper and mid-layer scsi.
+	int err = 0;
+
+	/* One pass for no-wait, four for wait:
+	 * 0) write out all dirty, unlocked buffers;
+	 * 1) wait for all dirty locked buffers;
+	 * 2) write out all dirty, unlocked buffers;
+	 * 3) wait for completion by waiting for all buffers to unlock.
 	 */
+	write_unlocked_buffers(dev);
+	if (wait) {
+		err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
+		write_unlocked_buffers(dev);
+		err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
+	}
+	return err;
 }
 
 int fsync_super(struct super_block *sb)
@@ -331,6 +318,15 @@
 	return sync_buffers(dev, 1);
 }
 
+/*
+ * There's no real reason to pretend we should
+ * ever do anything differently
+ */
+void sync_dev(kdev_t dev)
+{
+	fsync_dev(dev);
+}
+
 asmlinkage long sys_sync(void)
 {
 	fsync_dev(0);
@@ -646,8 +642,8 @@
 			/* Another device? */
 			if (bh->b_dev != dev)
 				continue;
-			/* Part of a mapping? */
-			if (bh->b_page->mapping)
+			/* Not hashed? */
+			if (!bh->b_pprev)
 				continue;
 			if (buffer_locked(bh)) {
 				atomic_inc(&bh->b_count);
@@ -711,6 +707,9 @@
 			bh_next = bh->b_next_free;
 			if (bh->b_dev != dev || bh->b_size == size)
 				continue;
+			/* Unhashed? */
+			if (!bh->b_pprev)
+				continue;
 			if (buffer_locked(bh)) {
 				atomic_inc(&bh->b_count);
 				spin_unlock(&lru_list_lock);
@@ -759,7 +758,7 @@
 {
 	balance_dirty(NODEV);
 	if (free_shortage())
-		page_launder(GFP_BUFFER, 0);
+		page_launder(GFP_NOFS, 0);
 	if (!grow_buffers(size)) {
 		wakeup_bdflush(1);
 		current->policy |= SCHED_YIELD;
@@ -1220,11 +1219,11 @@
 	}
 	spin_unlock(&unused_list_lock);
 
-	/* This is critical.  We can't swap out pages to get
-	 * more buffer heads, because the swap-out may need
-	 * more buffer-heads itself.  Thus SLAB_BUFFER.
+	/* This is critical.  We can't call out to the FS
+	 * to get more buffer heads, because the FS may need
+	 * more buffer-heads itself.  Thus SLAB_NOFS.
 	 */
-	if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) {
+	if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
 		bh->b_blocknr = -1;
 		bh->b_this_page = NULL;
 		return bh;
@@ -1348,11 +1347,9 @@
 	 */
 	run_task_queue(&tq_disk);
 
-	/* 
-	 * Set our state for sleeping, then check again for buffer heads.
-	 * This ensures we won't miss a wake_up from an interrupt.
-	 */
-	wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
+	current->policy |= SCHED_YIELD;
+	__set_current_state(TASK_RUNNING);
+	schedule();
 	goto try_again;
 }
 
@@ -2242,7 +2239,7 @@
 		return 0;
 	}
 
-	page = alloc_page(GFP_BUFFER);
+	page = alloc_page(GFP_NOFS);
 	if (!page)
 		goto out;
 	LockPage(page);
@@ -2304,7 +2301,7 @@
  *	1 - start IO for dirty buffers
  *	2 - wait for completion of locked buffers
  */
-static void sync_page_buffers(struct buffer_head *bh, int wait)
+static void sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask)
 {
 	struct buffer_head * tmp = bh;
 
@@ -2312,7 +2309,7 @@
 		struct buffer_head *p = tmp;
 		tmp = tmp->b_this_page;
 		if (buffer_locked(p)) {
-			if (wait > 1)
+			if (gfp_mask & __GFP_WAIT)
 				__wait_on_buffer(p);
 		} else if (buffer_dirty(p))
 			ll_rw_block(WRITE, 1, &p);
@@ -2336,7 +2333,7 @@
  *       obtain a reference to a buffer head within a page.  So we must
  *	 lock out all of these paths to cleanly toss the page.
  */
-int try_to_free_buffers(struct page * page, int wait)
+int try_to_free_buffers(struct page * page, unsigned int gfp_mask)
 {
 	struct buffer_head * tmp, * bh = page->buffers;
 	int index = BUFSIZE_INDEX(bh->b_size);
@@ -2387,10 +2384,10 @@
 	spin_unlock(&free_list[index].lock);
 	write_unlock(&hash_table_lock);
 	spin_unlock(&lru_list_lock);
-	if (wait) {
-		sync_page_buffers(bh, wait);
+	if (gfp_mask & __GFP_IO) {
+		sync_page_buffers(bh, gfp_mask);
 		/* We waited synchronously, so we can free the buffers. */
-		if (wait > 1 && !loop) {
+		if ((gfp_mask & __GFP_WAIT) && !loop) {
 			loop = 1;
 			goto cleaned_buffers_try_again;
 		}
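
The heart of this patch is the replacement of the old one-buffer-at-a-time sync_buffers() loop with batched submission: write_unlocked_buffers() collects up to NRSYNC (32) locked-and-cleaned dirty buffers into an array, and write_locked_buffers() submits the whole batch and then waits only on the first buffer, throttling the caller while leaving the rest of the batch in flight. Below is a minimal user-space sketch of that batching pattern, not kernel code; fake_buffer, submit_write() and wait_for() are illustrative stand-ins for buffer_head, submit_bh() and wait_on_buffer().

	#include <stdio.h>

	#define NRSYNC 32	/* same batch size the patch uses */

	struct fake_buffer {
		int id;
		int dirty;
	};

	/* Stand-in for submit_bh(WRITE, bh): queue an asynchronous write. */
	static void submit_write(struct fake_buffer *b)
	{
		printf("submit write for buffer %d\n", b->id);
		b->dirty = 0;
	}

	/* Stand-in for wait_on_buffer(): block until this write completes. */
	static void wait_for(struct fake_buffer *b)
	{
		printf("wait for buffer %d\n", b->id);
	}

	/* Mirrors write_locked_buffers(): submit the batch, wait on the first. */
	static void flush_batch(struct fake_buffer **batch, unsigned int count)
	{
		struct fake_buffer *first = batch[0];
		unsigned int i;

		for (i = 0; i < count; i++)
			submit_write(batch[i]);
		wait_for(first);	/* throttle without serializing the batch */
	}

	/* Mirrors the loop in write_unlocked_buffers(): batch the dirty buffers. */
	int main(void)
	{
		struct fake_buffer bufs[100];
		struct fake_buffer *batch[NRSYNC];
		unsigned int count = 0;
		int i;

		for (i = 0; i < 100; i++) {
			bufs[i].id = i;
			bufs[i].dirty = (i % 3) != 0;	/* arbitrary dirty pattern */
		}

		for (i = 0; i < 100; i++) {
			if (!bufs[i].dirty)
				continue;
			batch[count++] = &bufs[i];
			if (count == NRSYNC) {
				flush_batch(batch, count);
				count = 0;
			}
		}
		if (count)
			flush_batch(batch, count);
		return 0;
	}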
