patch-2.3.7 linux/fs/buffer.c


diff -u --recursive --new-file v2.3.6/linux/fs/buffer.c linux/fs/buffer.c
@@ -24,6 +24,8 @@
  * - RMK
  */
 
+#include <linux/sched.h>
+#include <linux/fs.h>
 #include <linux/malloc.h>
 #include <linux/locks.h>
 #include <linux/errno.h>
@@ -113,7 +115,7 @@
 
 /* These are the min and max parameter values that we will allow to be assigned */
 int bdflush_min[N_PARAM] = {  0,  10,    5,   25,  0,   1*HZ,   1*HZ, 1, 1};
-int bdflush_max[N_PARAM] = {100,5000, 2000, 2000,100, 600*HZ, 600*HZ, 2047, 5};
+int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,1000, 6000*HZ, 6000*HZ, 2047, 5};
 
 void wakeup_bdflush(int);
 
@@ -422,7 +424,25 @@
 #define _hashfn(dev,block) (((unsigned)(HASHDEV(dev)^block)) & bh_hash_mask)
 #define hash(dev,block) hash_table[_hashfn(dev,block)]
 
-static inline void remove_from_hash_queue(struct buffer_head * bh)
+static void insert_into_hash_list(struct buffer_head * bh)
+{
+	bh->b_next = NULL;
+	bh->b_pprev = NULL;
+	if (bh->b_dev) {
+		struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
+		struct buffer_head *next = *bhp;
+
+		if (next) {
+			bh->b_next = next;
+			next->b_pprev = &bh->b_next;
+		}
+		*bhp = bh;
+		bh->b_pprev = bhp;
+		nr_hashed_buffers++;
+	}
+}
+
+static void remove_from_hash_queue(struct buffer_head * bh)
 {
 	struct buffer_head **pprev = bh->b_pprev;
 	if (pprev) {
@@ -433,16 +453,43 @@
 		}
 		*pprev = next;
 		bh->b_pprev = NULL;
+		nr_hashed_buffers--;
 	}
-	nr_hashed_buffers--;
 }
 
-static inline void remove_from_lru_list(struct buffer_head * bh)
+static void insert_into_lru_list(struct buffer_head * bh)
 {
-	if (!(bh->b_prev_free) || !(bh->b_next_free))
-		panic("VFS: LRU block list corrupted");
+	struct buffer_head **bhp = &lru_list[bh->b_list];
+
 	if (bh->b_dev == B_FREE)
-		panic("LRU list corrupted");
+		BUG();
+
+	if(!*bhp) {
+		*bhp = bh;
+		bh->b_prev_free = bh;
+	}
+
+	if (bh->b_next_free)
+		panic("VFS: buffer LRU pointers corrupted");
+
+	bh->b_next_free = *bhp;
+	bh->b_prev_free = (*bhp)->b_prev_free;
+	(*bhp)->b_prev_free->b_next_free = bh;
+	(*bhp)->b_prev_free = bh;
+
+	nr_buffers++;
+	nr_buffers_type[bh->b_list]++;
+}
+
+static void remove_from_lru_list(struct buffer_head * bh)
+{
+	if (!(bh->b_prev_free) || !(bh->b_next_free))
+		return;
+
+	if (bh->b_dev == B_FREE) {
+		printk("LRU list corrupted");
+		*(int*)0 = 0;
+	}
 	bh->b_prev_free->b_next_free = bh->b_next_free;
 	bh->b_next_free->b_prev_free = bh->b_prev_free;
 
@@ -451,9 +498,12 @@
 	if (lru_list[bh->b_list] == bh)
 		 lru_list[bh->b_list] = NULL;
 	bh->b_next_free = bh->b_prev_free = NULL;
+
+	nr_buffers--;
+	nr_buffers_type[bh->b_list]--;
 }
 
-static inline void remove_from_free_list(struct buffer_head * bh)
+static void remove_from_free_list(struct buffer_head * bh)
 {
 	int isize = BUFSIZE_INDEX(bh->b_size);
 	if (!(bh->b_prev_free) || !(bh->b_next_free))
@@ -475,21 +525,20 @@
 
 static void remove_from_queues(struct buffer_head * bh)
 {
-	if(bh->b_dev == B_FREE) {
-		remove_from_free_list(bh); /* Free list entries should not be
-					      in the hash queue */
-		return;
-	}
-	nr_buffers_type[bh->b_list]--;
+	if (bh->b_dev == B_FREE)
+		BUG();
 	remove_from_hash_queue(bh);
 	remove_from_lru_list(bh);
 }
 
-static inline void put_last_free(struct buffer_head * bh)
+static void put_last_free(struct buffer_head * bh)
 {
 	if (bh) {
 		struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];
 
+		if (bh->b_count)
+			BUG();
+
 		bh->b_dev = B_FREE;  /* So it is obvious we are on the free list. */
 
 		/* Add to back of free list. */
@@ -505,47 +554,6 @@
 	}
 }
 
-static void insert_into_queues(struct buffer_head * bh)
-{
-	/* put at end of free list */
-	if(bh->b_dev == B_FREE) {
-		put_last_free(bh);
-	} else {
-		struct buffer_head **bhp = &lru_list[bh->b_list];
-
-		if(!*bhp) {
-			*bhp = bh;
-			bh->b_prev_free = bh;
-		}
-
-		if (bh->b_next_free)
-			panic("VFS: buffer LRU pointers corrupted");
-
-		bh->b_next_free = *bhp;
-		bh->b_prev_free = (*bhp)->b_prev_free;
-		(*bhp)->b_prev_free->b_next_free = bh;
-		(*bhp)->b_prev_free = bh;
-
-		nr_buffers_type[bh->b_list]++;
-
-		/* Put the buffer in new hash-queue if it has a device. */
-		bh->b_next = NULL;
-		bh->b_pprev = NULL;
-		if (bh->b_dev) {
-			struct buffer_head **bhp = &hash(bh->b_dev, bh->b_blocknr);
-			struct buffer_head *next = *bhp;
-
-			if (next) {
-				bh->b_next = next;
-				next->b_pprev = &bh->b_next;
-			}
-			*bhp = bh;
-			bh->b_pprev = bhp;
-		}
-		nr_hashed_buffers++;
-	}
-}
-
 struct buffer_head * find_buffer(kdev_t dev, int block, int size)
 {		
 	struct buffer_head * next;
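
The removed insert_into_queues() used to combine free-list, LRU and hash insertion in a single helper. Its non-free-list half now roughly corresponds to calling the two new primitives in sequence, with the nr_buffers accounting moved into insert_into_lru_list() (it previously lived in grow_buffers(), see the hunk near the end of the patch). A minimal sketch, not part of the patch:

	/* Illustration only: rough equivalent of the removed
	 * insert_into_queues() for a buffer that is not B_FREE. */
	static void insert_into_queues_equiv(struct buffer_head * bh)
	{
		insert_into_lru_list(bh);	/* link into lru_list[bh->b_list], bump counters */
		insert_into_hash_list(bh);	/* link into the hash chain if bh->b_dev is set */
	}

The B_FREE case is handled by calling put_last_free() directly, as the converted callers later in the patch do.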
@@ -636,6 +644,7 @@
 			if (bh->b_size == size)
 				 continue;
 			bhnext->b_count++;
+			bh->b_count++;
 			wait_on_buffer(bh);
 			bhnext->b_count--;
 			if (bh->b_dev == dev && bh->b_size != size) {
@@ -644,9 +653,10 @@
 				clear_bit(BH_Req, &bh->b_state);
 				bh->b_flushtime = 0;
 			}
+			if (--bh->b_count)
+				continue;
 			remove_from_queues(bh);
-			bh->b_dev=B_FREE;
-			insert_into_queues(bh);
+			put_last_free(bh);
 		}
 	}
 }
@@ -666,7 +676,6 @@
 void init_buffer(struct buffer_head *bh, kdev_t dev, int block,
 		 bh_end_io_t *handler, void *dev_id)
 {
-	bh->b_count = 1;
 	bh->b_list = BUF_CLEAN;
 	bh->b_flushtime = 0;
 	bh->b_dev = dev;
@@ -702,7 +711,7 @@
 		if (!buffer_dirty(bh)) {
 			bh->b_flushtime = 0;
 		}
-		return bh;
+		goto out;
 	}
 
 	isize = BUFSIZE_INDEX(size);
@@ -716,9 +725,13 @@
 	 * and that it's unused (b_count=0), unlocked, and clean.
 	 */
 	init_buffer(bh, dev, block, end_buffer_io_sync, NULL);
-	bh->b_state=0;
-	insert_into_queues(bh);
-	return bh;
+	bh->b_count = 1;
+	bh->b_state = 0;
+
+	/* Insert the buffer into the regular lists */
+	insert_into_lru_list(bh);
+	insert_into_hash_list(bh);
+	goto out;
 
 	/*
 	 * If we block while refilling the free list, somebody may
@@ -729,6 +742,8 @@
 	if (!find_buffer(dev,block,size))
 		goto get_free;
 	goto repeat;
+out:
+	return bh;
 }
 
 void set_writetime(struct buffer_head * buf, int flag)
@@ -746,15 +761,56 @@
 	}
 }
 
-
 /*
  * Put a buffer into the appropriate list, without side-effects.
  */
-static inline void file_buffer(struct buffer_head *bh, int list)
+static void file_buffer(struct buffer_head *bh, int list)
 {
-	remove_from_queues(bh);
+	remove_from_lru_list(bh);
 	bh->b_list = list;
-	insert_into_queues(bh);
+	insert_into_lru_list(bh);
+}
+
+/*
+ * if a new dirty buffer is created we need to balance bdflush.
+ *
+ * in the future we might want to make bdflush aware of different
+ * pressures on different devices - thus the (currently unused)
+ * 'dev' parameter.
+ */
+void balance_dirty(kdev_t dev)
+{
+	int dirty = nr_buffers_type[BUF_DIRTY];
+	int ndirty = bdf_prm.b_un.ndirty;
+
+	if (dirty > ndirty) {
+		int wait = 0;
+		if (dirty > 2*ndirty)
+			wait = 1;
+		wakeup_bdflush(wait);
+	}
+}
+
+atomic_t too_many_dirty_buffers;
+
+static inline void __mark_dirty(struct buffer_head *bh, int flag)
+{
+	set_writetime(bh, flag);
+	refile_buffer(bh);
+	if (atomic_read(&too_many_dirty_buffers))
+		balance_dirty(bh->b_dev);
+}
+
+void __mark_buffer_dirty(struct buffer_head *bh, int flag)
+{
+	__mark_dirty(bh, flag);
+}
+
+void __atomic_mark_buffer_dirty(struct buffer_head *bh, int flag)
+{
+	lock_kernel();
+	__mark_dirty(bh, flag);
+	unlock_kernel();
 }
 
 /*
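
To make the new write-throttling path concrete: once the too_many_dirty_buffers flag has been raised (that happens outside this excerpt), every buffer dirtied through these helpers funnels into balance_dirty(), which wakes bdflush and, above twice the ndirty threshold, also sleeps until bdflush signals completion, slowing the writer down. A hedged sketch of the caller's side, assuming atomic_mark_buffer_dirty() is the header-level wrapper around __atomic_mark_buffer_dirty() that the rest of this patch uses, and with a hypothetical examplefs_ helper name:

	/* Illustration only, not part of the patch. */
	static void examplefs_dirty_block(struct buffer_head * bh, const char * data)
	{
		memcpy(bh->b_data, data, bh->b_size);	/* modify the cached block */
		atomic_mark_buffer_dirty(bh, 0);	/* refiles to BUF_DIRTY; may call
							 * balance_dirty() and block until
							 * bdflush has caught up */
	}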
@@ -765,36 +821,19 @@
 {
 	int dispose;
 
-	if(buf->b_dev == B_FREE) {
+	if (buf->b_dev == B_FREE) {
 		printk("Attempt to refile free buffer\n");
 		return;
 	}
+
+	dispose = BUF_CLEAN;
+	if (buffer_locked(buf))
+		dispose = BUF_LOCKED;
 	if (buffer_dirty(buf))
 		dispose = BUF_DIRTY;
-	else if (buffer_locked(buf))
-		dispose = BUF_LOCKED;
-	else
-		dispose = BUF_CLEAN;
-	if(dispose != buf->b_list) {
-		file_buffer(buf, dispose);
-		if(dispose == BUF_DIRTY) {
-			int too_many = (nr_buffers * bdf_prm.b_un.nfract/100);
-
-			/* This buffer is dirty, maybe we need to start flushing.
-			 * If too high a percentage of the buffers are dirty...
-			 */
-			if (nr_buffers_type[BUF_DIRTY] > too_many)
-				wakeup_bdflush(1);
 
-			/* If this is a loop device, and
-			 * more than half of the buffers are dirty...
-			 * (Prevents no-free-buffers deadlock with loop device.)
-			 */
-			if (MAJOR(buf->b_dev) == LOOP_MAJOR &&
-			    nr_buffers_type[BUF_DIRTY]*2>nr_buffers)
-				wakeup_bdflush(1);
-		}
-	}
+	if (dispose != buf->b_list)
+		file_buffer(buf, dispose);
 }
 
 /*
@@ -809,6 +848,7 @@
 
 	if (buf->b_count) {
 		buf->b_count--;
+		wake_up(&buffer_wait);
 		return;
 	}
 	printk("VFS: brelse: Trying to free free buffer\n");
@@ -890,7 +930,6 @@
 
 /*	if (blocks) printk("breada (new) %d blocks\n",blocks); */
 
-
 	bhlist[0] = bh;
 	j = 1;
 	for(i=1; i<blocks; i++) {
@@ -928,7 +967,8 @@
 		return;
 	}
 
-	memset(bh,0,sizeof(*bh));
+//	memset(bh, 0, sizeof(*bh));
+	bh->b_blocknr = -1;
 	init_waitqueue_head(&bh->b_wait);
 	nr_unused_buffer_heads++;
 	bh->b_next_free = unused_list;
@@ -1153,17 +1193,12 @@
 	struct page *page;
 
 	mark_buffer_uptodate(bh, uptodate);
-	unlock_buffer(bh);
 
 	/* This is a temporary buffer used for page I/O. */
 	page = mem_map + MAP_NR(bh->b_data);
-	if (!PageLocked(page))
-		goto not_locked;
-	if (bh->b_count != 1)
-		goto bad_count;
 
-	if (!test_bit(BH_Uptodate, &bh->b_state))
-		set_bit(PG_error, &page->flags);
+	if (!uptodate)
+		SetPageError(page);
 
 	/*
 	 * Be _very_ careful from here on. Bad things can happen if
@@ -1179,69 +1214,63 @@
 	 */
 	save_flags(flags);
 	cli();
-	bh->b_count--;
-	tmp = bh;
-	do {
-		if (tmp->b_count)
+	unlock_buffer(bh);
+	tmp = bh->b_this_page;
+	while (tmp != bh) {
+		if (buffer_locked(tmp))
 			goto still_busy;
 		tmp = tmp->b_this_page;
-	} while (tmp != bh);
+	}
 
 	/* OK, the async IO on this page is complete. */
-	free_async_buffers(bh);
 	restore_flags(flags);
-	clear_bit(PG_locked, &page->flags);
-	wake_up(&page->wait);
+
 	after_unlock_page(page);
+	/*
+	 * if none of the buffers had errors then we can set the
+	 * page uptodate:
+	 */
+	if (!PageError(page))
+		SetPageUptodate(page);
+	if (page->owner != -1)
+		PAGE_BUG(page);
+	page->owner = (int)current;
+	UnlockPage(page);
+
 	return;
 
 still_busy:
 	restore_flags(flags);
 	return;
-
-not_locked:
-	printk ("Whoops: end_buffer_io_async: async io complete on unlocked page\n");
-	return;
-
-bad_count:
-	printk ("Whoops: end_buffer_io_async: b_count != 1 on async io.\n");
-	return;
 }
 
-/*
- * Start I/O on a page.
- * This function expects the page to be locked and may return before I/O is complete.
- * You then have to check page->locked, page->uptodate, and maybe wait on page->wait.
- */
-int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
+static int create_page_buffers (int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
 {
-	struct buffer_head *bh, *prev, *next, *arr[MAX_BUF_PER_PAGE];
-	int block, nr;
+	struct buffer_head *head, *bh, *tail;
+	int block;
 
 	if (!PageLocked(page))
-		panic("brw_page: page not locked for I/O");
-	clear_bit(PG_uptodate, &page->flags);
-	clear_bit(PG_error, &page->flags);
+		BUG();
+	if (page->owner != (int)current)
+		PAGE_BUG(page);
 	/*
 	 * Allocate async buffer heads pointing to this page, just for I/O.
-	 * They do _not_ show up in the buffer hash table!
-	 * They are _not_ registered in page->buffers either!
+	 * They show up in the buffer hash table and are registered in
+	 * page->buffers.
 	 */
-	bh = create_buffers(page_address(page), size, 1);
-	if (!bh) {
-		/* WSH: exit here leaves page->count incremented */
-		clear_bit(PG_locked, &page->flags);
-		wake_up(&page->wait);
-		return -ENOMEM;
-	}
-	nr = 0;
-	next = bh;
-	do {
-		struct buffer_head * tmp;
+	lock_kernel();
+	head = create_buffers(page_address(page), size, 1);
+	unlock_kernel();
+	if (page->buffers)
+		BUG();
+	if (!head)
+		BUG();
+	tail = head;
+	for (bh = head; bh; bh = bh->b_this_page) {
 		block = *(b++);
 
-		init_buffer(next, dev, block, end_buffer_io_async, NULL);
-		set_bit(BH_Uptodate, &next->b_state);
+		tail = bh;
+		init_buffer(bh, dev, block, end_buffer_io_async, NULL);
 
 		/*
 		 * When we use bmap, we define block zero to represent
@@ -1250,51 +1279,379 @@
 		 * two cases.
 		 */
 		if (bmap && !block) {
-			memset(next->b_data, 0, size);
-			next->b_count--;
-			continue;
+			set_bit(BH_Uptodate, &bh->b_state);
+			memset(bh->b_data, 0, size);
+		}
+	}
+	tail->b_this_page = head;
+	get_page(page);
+	page->buffers = head;
+	return 0;
+}
+
+/*
+ * We don't have to release all buffers here, but
+ * we have to be sure that no dirty buffer is left
+ * and no IO is going on (no buffer is locked), because
+ * we have truncated the file and are going to free the
+ * blocks on-disk..
+ */
+int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int curr_off = 0;
+
+	if (!PageLocked(page))
+		BUG();
+	if (!page->buffers)
+		return 0;
+	lock_kernel();
+
+	head = page->buffers;
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		/*
+		 * is this block fully flushed?
+		 */
+		if (offset <= curr_off) {
+			if (bh->b_blocknr) {
+				bh->b_count++;
+				wait_on_buffer(bh);
+				if (bh->b_dev == B_FREE)
+					BUG();
+				mark_buffer_clean(bh);
+				bh->b_blocknr = 0;
+				bh->b_count--;
+			}
+		}
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
+
+	/*
+	 * subtle. We release buffer-heads only if this is
+	 * the 'final' flushpage. We have invalidated the bmap
+	 * cached value unconditionally, so real IO is not
+	 * possible anymore.
+	 */
+	if (!offset)
+		try_to_free_buffers(page);
+
+	unlock_kernel();
+	return 0;
+}
+
+static void create_empty_buffers (struct page *page,
+			struct inode *inode, unsigned long blocksize)
+{
+	struct buffer_head *bh, *head, *tail;
+
+	lock_kernel();
+	head = create_buffers(page_address(page), blocksize, 1);
+	unlock_kernel();
+	if (page->buffers)
+		BUG();
+
+	bh = head;
+	do {
+		bh->b_dev = inode->i_dev;
+		bh->b_blocknr = 0;
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+	page->buffers = head;
+	get_page(page);
+}
+
+/*
+ * block_write_full_page() is SMP-safe - currently it's still
+ * being called with the kernel lock held, but the code is ready.
+ */
+int block_write_full_page (struct file *file, struct page *page, fs_getblock_t fs_get_block)
+{
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	int err, created, i;
+	unsigned long block, phys, offset;
+	struct buffer_head *bh, *head;
+
+	if (!PageLocked(page))
+		BUG();
+
+	if (!page->buffers)
+		create_empty_buffers(page, inode, inode->i_sb->s_blocksize);
+	head = page->buffers;
+
+	offset = page->offset;
+	block = offset >> inode->i_sb->s_blocksize_bits;
+
+	// FIXME: currently we assume page alignment.
+	if (offset & (PAGE_SIZE-1))
+		BUG();
+
+	bh = head;
+	i = 0;
+	do {
+		if (!bh)
+			BUG();
+
+		if (!bh->b_blocknr) {
+			err = -EIO;
+			down(&inode->i_sem);
+			phys = fs_get_block (inode, block, 1, &err, &created);
+			up(&inode->i_sem);
+			if (!phys)
+				goto out;
+
+			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
+			bh->b_state = (1<<BH_Uptodate);
+		} else {
+			/*
+			 * block already exists, just mark it uptodate and
+			 * dirty:
+			 */
+			bh->b_end_io = end_buffer_io_sync;
+			set_bit(BH_Uptodate, &bh->b_state);
 		}
-		tmp = get_hash_table(dev, block, size);
-		if (tmp) {
-			if (!buffer_uptodate(tmp)) {
-				if (rw == READ)
-					ll_rw_block(READ, 1, &tmp);
-				wait_on_buffer(tmp);
+		atomic_mark_buffer_dirty(bh,0);
+
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	SetPageUptodate(page);
+	return 0;
+out:
+	ClearPageUptodate(page);
+	return err;
+}
+
+int block_write_partial_page (struct file *file, struct page *page, unsigned long offset, unsigned long bytes, const char * buf, fs_getblock_t fs_get_block)
+{
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	unsigned long block;
+	int err, created, partial;
+	unsigned long blocksize, start_block, end_block;
+	unsigned long start_offset, start_bytes, end_bytes;
+	unsigned long bbits, phys, blocks, i, len;
+	struct buffer_head *bh, *head;
+	char * target_buf;
+
+	target_buf = (char *)page_address(page) + offset;
+
+	if (!PageLocked(page))
+		BUG();
+
+	blocksize = inode->i_sb->s_blocksize;
+	if (!page->buffers)
+		create_empty_buffers(page, inode, blocksize);
+	head = page->buffers;
+
+	bbits = inode->i_sb->s_blocksize_bits;
+	block = page->offset >> bbits;
+	blocks = PAGE_SIZE >> bbits;
+	start_block = offset >> bbits;
+	end_block = (offset + bytes - 1) >> bbits;
+	start_offset = offset & (blocksize - 1);
+	start_bytes = blocksize - start_offset;
+	if (start_bytes > bytes)
+		start_bytes = bytes;
+	end_bytes = (offset+bytes) & (blocksize - 1);
+	if (end_bytes > bytes)
+		end_bytes = bytes;
+
+	if (offset < 0 || offset >= PAGE_SIZE)
+		BUG();
+	if (bytes+offset < 0 || bytes+offset > PAGE_SIZE)
+		BUG();
+	if (start_block < 0 || start_block >= blocks)
+		BUG();
+	if (end_block < 0 || end_block >= blocks)
+		BUG();
+	// FIXME: currently we assume page alignment.
+	if (page->offset & (PAGE_SIZE-1))
+		BUG();
+
+	i = 0;
+	bh = head;
+	partial = 0;
+	do {
+		if (!bh)
+			BUG();
+
+		if ((i < start_block) || (i > end_block)) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+			goto skip;
+		}
+		if (!bh->b_blocknr) {
+			err = -EIO;
+			down(&inode->i_sem);
+			phys = fs_get_block (inode, block, 1, &err, &created);
+			up(&inode->i_sem);
+			if (!phys)
+				goto out;
+
+			init_buffer(bh, inode->i_dev, phys, end_buffer_io_sync, NULL);
+
+			/*
+			 * if partially written block which has contents on
+			 * disk, then we have to read it first.
+			 * We also rely on the fact that filesystem holes
+			 * cannot be written.
+			 */
+			if (!created && (start_offset ||
+					(end_bytes && (i == end_block)))) {
+				bh->b_state = 0;
+				ll_rw_block(READ, 1, &bh);
+				lock_kernel();
+				wait_on_buffer(bh);
+				unlock_kernel();
+				err = -EIO;
+				if (!buffer_uptodate(bh))
+					goto out;
 			}
-			if (rw == READ) 
-				memcpy(next->b_data, tmp->b_data, size);
-			else {
-				memcpy(tmp->b_data, next->b_data, size);
-				mark_buffer_dirty(tmp, 0);
+
+			bh->b_state = (1<<BH_Uptodate);
+		} else {
+			/*
+			 * block already exists, just mark it uptodate:
+			 */
+			bh->b_end_io = end_buffer_io_sync;
+			set_bit(BH_Uptodate, &bh->b_state);
+		}
+
+		err = -EFAULT;
+		if (start_offset) {
+			len = start_bytes;
+			start_offset = 0;
+		} else
+		if (end_bytes && (i == end_block)) {
+			len = end_bytes;
+			end_bytes = 0;
+		} else {
+			/*
+			 * Overwritten block.
+			 */
+			len = blocksize;
+		}
+		if (copy_from_user(target_buf, buf, len))
+			goto out;
+		target_buf += len;
+		buf += len;
+
+		/*
+		 * we dirty buffers only after copying the data into
+		 * the page - this way we can dirty the buffer even if
+		 * the bh is still doing IO.
+		 */
+		atomic_mark_buffer_dirty(bh,0);
+skip:
+		i++;
+		block++;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	/*
+	 * is this a partial write that happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' wether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return bytes;
+out:
+	ClearPageUptodate(page);
+	return err;
+}
+
+/*
+ * Start I/O on a page.
+ * This function expects the page to be locked and may return
+ * before I/O is complete. You then have to check page->locked,
+ * page->uptodate, and maybe wait on page->wait.
+ *
+ * brw_page() is SMP-safe, although it's being called with the
+ * kernel lock held - but the code is ready.
+ */
+int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size, int bmap)
+{
+	struct buffer_head *head, *bh, *arr[MAX_BUF_PER_PAGE];
+	int nr, fresh /* temporary debugging flag */, block;
+
+	if (!PageLocked(page))
+		panic("brw_page: page not locked for I/O");
+//	clear_bit(PG_error, &page->flags);
+	/*
+	 * We pretty much rely on the page lock for this, because
+	 * create_page_buffers() might sleep.
+	 */
+	fresh = 0;
+	if (!page->buffers) {
+		create_page_buffers(rw, page, dev, b, size, bmap);
+		fresh = 1;
+	}
+	if (!page->buffers)
+		BUG();
+	page->owner = -1;
+
+	head = page->buffers;
+	bh = head;
+	nr = 0;
+	do {
+		block = *(b++);
+
+		if (fresh && (bh->b_count != 0))
+			BUG();
+		if (rw == READ) {
+			if (!fresh)
+				BUG();
+			if (bmap && !block) {
+				if (block)
+					BUG();
+			} else {
+				if (bmap && !block)
+					BUG();
+				if (!buffer_uptodate(bh)) {
+					arr[nr++] = bh;
+				}
 			}
-			brelse(tmp);
-			next->b_count--;
-			continue;
+		} else { /* WRITE */
+			if (!bh->b_blocknr) {
+				if (!block)
+					BUG();
+				bh->b_blocknr = block;
+			} else {
+				if (!block)
+					BUG();
+			}
+			set_bit(BH_Uptodate, &bh->b_state);
+			atomic_mark_buffer_dirty(bh, 0);
+			arr[nr++] = bh;
 		}
-		if (rw == READ)
-			clear_bit(BH_Uptodate, &next->b_state);
-		else
-			set_bit(BH_Dirty, &next->b_state);
-		arr[nr++] = next;
-	} while (prev = next, (next = next->b_this_page) != NULL);
-	prev->b_this_page = bh;
-	
-	if (nr) {
+		bh = bh->b_this_page;
+	} while (bh != head);
+	if (rw == READ)
+		++current->maj_flt;
+	if ((rw == READ) && nr) {
+		if (Page_Uptodate(page))
+			BUG();
 		ll_rw_block(rw, nr, arr);
-		/* The rest of the work is done in mark_buffer_uptodate()
-		 * and unlock_buffer(). */
 	} else {
-		unsigned long flags;
-		clear_bit(PG_locked, &page->flags);
-		set_bit(PG_uptodate, &page->flags);
-		wake_up(&page->wait);
-		save_flags(flags);
-		cli();
-		free_async_buffers(bh);
-		restore_flags(flags);
-		after_unlock_page(page);
+		if (!nr && rw == READ) {
+			SetPageUptodate(page);
+			page->owner = (int)current;
+			UnlockPage(page);
+		}
+		if (nr && (rw == WRITE))
+			ll_rw_block(rw, nr, arr);
 	}
-	++current->maj_flt;
 	return 0;
 }
 
@@ -1305,6 +1662,7 @@
 {
 	if (on) {
 		struct buffer_head *tmp = bh;
+		struct page *page;
 		set_bit(BH_Uptodate, &bh->b_state);
 		/* If a page has buffers and all these buffers are uptodate,
 		 * then the page is uptodate. */
@@ -1313,7 +1671,8 @@
 				return;
 			tmp=tmp->b_this_page;
 		} while (tmp && tmp != bh);
-		set_bit(PG_uptodate, &mem_map[MAP_NR(bh->b_data)].flags);
+		page = mem_map + MAP_NR(bh->b_data);
+		SetPageUptodate(page);
 		return;
 	}
 	clear_bit(BH_Uptodate, &bh->b_state);
@@ -1326,30 +1685,70 @@
  * mark_buffer_uptodate() functions propagate buffer state into the
  * page struct once IO has completed.
  */
-int generic_readpage(struct file * file, struct page * page)
+int block_read_full_page(struct file * file, struct page * page)
 {
 	struct dentry *dentry = file->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	unsigned long block;
-	int *p, nr[PAGE_SIZE/512];
-	int i;
+	unsigned long iblock, phys_block;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, blocks;
+	int nr;
 
-	atomic_inc(&page->count);
-	set_bit(PG_locked, &page->flags);
-	set_bit(PG_free_after, &page->flags);
-	
-	i = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
-	block = page->offset >> inode->i_sb->s_blocksize_bits;
-	p = nr;
+	if (!PageLocked(page))
+		PAGE_BUG(page);
+	blocksize = inode->i_sb->s_blocksize;
+	if (!page->buffers)
+		create_empty_buffers(page, inode, blocksize);
+	head = page->buffers;
+
+	blocks = PAGE_SIZE >> inode->i_sb->s_blocksize_bits;
+	iblock = page->offset >> inode->i_sb->s_blocksize_bits;
+	page->owner = -1;
+	head = page->buffers;
+	bh = head;
+	nr = 0;
 	do {
-		*p = inode->i_op->bmap(inode, block);
-		i--;
-		block++;
-		p++;
-	} while (i > 0);
+		phys_block = bh->b_blocknr;
+		/*
+		 * important, we have to retry buffers that already have
+		 * their bnr cached but had an IO error!
+		 */
+		if (!buffer_uptodate(bh)) {
+			phys_block = inode->i_op->bmap(inode, iblock);
+			/*
+			 * this is safe to do because we hold the page lock:
+			 */
+			if (phys_block) {
+				init_buffer(bh, inode->i_dev, phys_block,
+						end_buffer_io_async, NULL);
+				arr[nr] = bh;
+				nr++;
+			} else {
+				/*
+				 * filesystem 'hole' represents zero-contents:
+				 */
+				memset(bh->b_data, 0, blocksize);
+				set_bit(BH_Uptodate, &bh->b_state);
+			}
+		}
+		iblock++;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
-	/* IO start */
-	brw_page(READ, page, inode->i_dev, nr, inode->i_sb->s_blocksize, 1);
+	++current->maj_flt;
+	if (nr) {
+		if (Page_Uptodate(page))
+			BUG();
+		ll_rw_block(READ, nr, arr);
+	} else {
+		/*
+		 * all buffers are uptodate - we can set the page
+		 * uptodate as well.
+		 */
+		SetPageUptodate(page);
+		page->owner = (int)current;
+		UnlockPage(page);
+	}
 	return 0;
 }
 
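With block_read_full_page() available, a block-based filesystem's readpage operation can delegate to it directly, relying on the filesystem's existing i_op->bmap() for the logical-to-physical mapping; holes (bmap() returning 0) are zero-filled and marked uptodate without any I/O. A minimal sketch, with a hypothetical examplefs_ name, not part of the patch:

	/* Illustration only: readpage via the new generic helper. */
	static int examplefs_readpage(struct file * file, struct page * page)
	{
		return block_read_full_page(file, page);
	}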
@@ -1392,7 +1791,6 @@
 			tmp->b_next_free = tmp;
 		}
 		insert_point = tmp;
-		++nr_buffers;
 		if (tmp->b_this_page)
 			tmp = tmp->b_this_page;
 		else
@@ -1409,7 +1807,7 @@
  * Can the buffer be thrown out?
  */
 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock) | (1<<BH_Protected))
-#define buffer_busy(bh)		((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
+#define buffer_busy(bh)	((bh)->b_count || ((bh)->b_state & BUFFER_BUSY_BITS))
 
 /*
  * try_to_free_buffers() checks if all the buffers on this particular page
@@ -1418,9 +1816,9 @@
  * Wake up bdflush() if this fails - if we're running low on memory due
  * to dirty buffers, we need to flush them out as quickly as possible.
  */
-int try_to_free_buffers(struct page * page_map)
+int try_to_free_buffers(struct page * page)
 {
-	struct buffer_head * tmp, * bh = page_map->buffers;
+	struct buffer_head * tmp, * bh = page->buffers;
 
 	tmp = bh;
 	do {
@@ -1429,8 +1827,6 @@
 		tmp = tmp->b_this_page;
 		if (!buffer_busy(p))
 			continue;
-
-		wakeup_bdflush(0);
 		return 0;
 	} while (tmp != bh);
 
@@ -1438,8 +1834,13 @@
 	do {
 		struct buffer_head * p = tmp;
 		tmp = tmp->b_this_page;
-		nr_buffers--;
-		remove_from_queues(p);
+
+		/* The buffer can be either on the regular queues or on the free list.. */		
+		if (p->b_dev == B_FREE)
+			remove_from_free_list(p);
+		else
+			remove_from_queues(p);
+
 		put_unused_buffer_head(p);
 	} while (tmp != bh);
 
@@ -1447,10 +1848,12 @@
 	wake_up(&buffer_wait);
 
 	/* And free the page */
-	buffermem -= PAGE_SIZE;
-	page_map->buffers = NULL;
-	__free_page(page_map);
-	return 1;
+	page->buffers = NULL;
+	if (__free_page(page)) {
+		buffermem -= PAGE_SIZE;
+		return 1;
+	}
+	return 0;
 }
 
 /* ================== Debugging =================== */
@@ -1509,11 +1912,11 @@
 	   the heuristic from working with large databases and getting
 	   fsync times (ext2) manageable, is the following */
 
-	memory_size >>= 20;
+	memory_size >>= 22;
 	for (order = 5; (1UL << order) < memory_size; order++);
 
 	/* try to allocate something until we get it or we're asking
-           for something that is really too small */
+	   for something that is really too small */
 
 	do {
 		nr_hash = (1UL << order) * PAGE_SIZE /
@@ -1521,6 +1924,7 @@
 		hash_table = (struct buffer_head **)
 		    __get_free_pages(GFP_ATOMIC, order);
 	} while (hash_table == NULL && --order > 4);
+	printk("buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (1UL<<order) * PAGE_SIZE);
 	
 	if (!hash_table)
 		panic("Failed to allocate buffer hash table\n");
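
The shift change makes the hash-table heuristic scale with 4 MB units of memory instead of 1 MB units. As an illustrative calculation, on a 256 MB machine:

	memory_size >> 22 = 64   ->  loop stops at order 6   (64-page table)
	old  >> 20 shift  = 256  ->  loop stopped at order 8 (256-page table)

so the table ends up roughly a quarter of its previous size, and the printk added here reports the chosen entry count, order and byte size at boot.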
@@ -1565,11 +1969,11 @@
 {
 	if (current == bdflush_tsk)
 		return;
-	wake_up(&bdflush_wait);
-	if (wait) {
+	if (wait)
 		run_task_queue(&tq_disk);
+	wake_up(&bdflush_wait);
+	if (wait)
 		sleep_on(&bdflush_done);
-	}
 }
 
 
@@ -1801,6 +2205,7 @@
 #endif
 					  bh->b_count--;
 					  next->b_count--;
+					  wake_up(&buffer_wait);
 				  }
 		 }
 #ifdef DEBUG
@@ -1818,9 +2223,14 @@
 		run_task_queue(&tq_disk);
 		wake_up(&bdflush_done);
 		
-		/* If there are still a lot of dirty buffers around, skip the sleep
-		   and flush some more */
-		if(ndirty == 0 || nr_buffers_type[BUF_DIRTY] <= nr_buffers * bdf_prm.b_un.nfract/100) {
+		/*
+		 * If there are still a lot of dirty buffers around,
+		 * skip the sleep and flush some more
+		 */
+		if ((ndirty == 0) || (nr_buffers_type[BUF_DIRTY] <=
+				 nr_buffers * bdf_prm.b_un.nfract/100)) {
+
+			atomic_set(&too_many_dirty_buffers, 0);
 			spin_lock_irq(&current->sigmask_lock);
 			flush_signals(current);
 			spin_unlock_irq(&current->sigmask_lock);
