[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]

Re: writing processes are blocking in log_wait_common with data=ordered



Hi,

On Thu, May 02, 2002 at 01:29:49PM -0400, David Mansfield wrote:

> The last thing I tried is running the program on a server here running the 
> beta of 'Red Hat Advanced Server' which is running their kernel 
> '2.4.9-26beta.31' and I got:
> 
> 1399 blks/sec ext3-ordered !!!
> 
> It turns out that this kernel ignores the O_SYNC on open... ha.

It looks like a generic 2.4.9 bug.

What's happening is that ext3 has its own extra logic for tracking
buffers when it comes to writing pages from the page cache, and part
of what the jbd layer does is to mark those buffers dirty.

Then, the core VFS __block_prepare_write comes along, and it does

			if (!atomic_set_buffer_dirty(bh)) {
				__mark_dirty(bh);
				buffer_insert_inode_queue(bh, inode);
				need_balance_dirty = 1;
			}

so if the buffer was not already dirty, it gets hooked onto the
appropriate inode's queue where O_SYNC can find it.  Unfortunately,
ext3 has already marked it dirty, so this code doesn't run.

It looks as if the same problem is still present in 2.4.18, except
ameliorated somewhat by new code in ext3_file_write:

	/*
	 * Nasty: if the file is subject to synchronous writes then we need
	 * to force generic_osync_inode() to call ext3_write_inode().
	 * We do that by marking the inode dirty.  This adds much more
	 * computational expense than we need, but we're going to sync
	 * anyway.
	 */
	if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
		mark_inode_dirty(inode);

That's actually the wrong fix --- for a multi-page write, it only
guarantees that the first page gets flushed (and not even that if
another task happens to flush the inode out before the page write
completes); and it forces an unnecessary inode sync even when we
haven't modified i_size and don't need to sync metadata.

So 2.4.18 is broken too, but so much less so that it doesn't show up
in benchmarks.  The 2.4.9 fix should fix 2.4.18 too, and will let us
remove the above performance degradation.  The patch I've been testing
for 2.4.9 is attached.

Cheers,
 Stephen
--- linux/fs/ext3/inode.c.~1~	Thu May  2 19:23:39 2002
+++ linux/fs/ext3/inode.c	Fri May  3 15:54:09 2002
@@ -949,11 +949,13 @@
 }
 
 static int walk_page_buffers(	handle_t *handle,
+				struct inode *inode,
 				struct buffer_head *head,
 				unsigned from,
 				unsigned to,
 				int *partial,
 				int (*fn)(	handle_t *handle,
+						struct inode *inode,
 						struct buffer_head *bh))
 {
 	struct buffer_head *bh;
@@ -971,7 +973,7 @@
 				*partial = 1;
 			continue;
 		}
-		err = (*fn)(handle, bh);
+		err = (*fn)(handle, inode, bh);
 		if (!ret)
 			ret = err;
 	}
@@ -1004,7 +1006,7 @@
  * write.  
  */
 
-static int do_journal_get_write_access(handle_t *handle, 
+static int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				       struct buffer_head *bh)
 {
 	return ext3_journal_get_write_access(handle, bh);
@@ -1028,7 +1030,7 @@
 		goto prepare_write_failed;
 
 	if (ext3_should_journal_data(inode)) {
-		ret = walk_page_buffers(handle, page->buffers,
+		ret = walk_page_buffers(handle, inode, page->buffers,
 				from, to, NULL, do_journal_get_write_access);
 		if (ret) {
 			/*
@@ -1049,24 +1051,32 @@
 	return ret;
 }
 
-static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_sync_data(handle_t *handle, struct inode *inode,
+				   struct buffer_head *bh)
 {
-	return ext3_journal_dirty_data(handle, bh, 0);
+	int ret = ext3_journal_dirty_data(handle, bh, 0);
+	if (bh->b_inode != inode)
+		buffer_insert_inode_queue(bh, inode);
+	return ret;
 }
 
 /*
  * For ext3_writepage().  We also brelse() the buffer to account for
  * the bget() which ext3_writepage() performs.
  */
-static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+static int journal_dirty_async_data(handle_t *handle, struct inode *inode, 
+				    struct buffer_head *bh)
 {
 	int ret = ext3_journal_dirty_data(handle, bh, 1);
+	if (bh->b_inode != inode)
+		buffer_insert_inode_queue(bh, inode);
 	__brelse(bh);
 	return ret;
 }
 
 /* For commit_write() in data=journal mode */
-static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+static int commit_write_fn(handle_t *handle, struct inode *inode, 
+			   struct buffer_head *bh)
 {
 	set_bit(BH_Uptodate, &bh->b_state);
 	return ext3_journal_dirty_metadata(handle, bh);
@@ -1101,7 +1111,7 @@
 		int partial = 0;
 		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
-		ret = walk_page_buffers(handle, page->buffers,
+		ret = walk_page_buffers(handle, inode, page->buffers,
 			from, to, &partial, commit_write_fn);
 		if (!partial)
 			SetPageUptodate(page);
@@ -1111,7 +1121,7 @@
 		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
 	} else {
 		if (ext3_should_order_data(inode)) {
-			ret = walk_page_buffers(handle, page->buffers,
+			ret = walk_page_buffers(handle, inode, page->buffers,
 				from, to, NULL, journal_dirty_sync_data);
 		}
 		/* Be careful here if generic_commit_write becomes a
@@ -1193,7 +1203,8 @@
 	return generic_block_bmap(mapping,block,ext3_get_block);
 }
 
-static int bget_one(handle_t *handle, struct buffer_head *bh)
+static int bget_one(handle_t *handle, struct inode *inode, 
+		    struct buffer_head *bh)
 {
 	atomic_inc(&bh->b_count);
 	return 0;
@@ -1292,7 +1303,7 @@
 			create_empty_buffers(page,
 				inode->i_dev, inode->i_sb->s_blocksize);
 		page_buffers = page->buffers;
-		walk_page_buffers(handle, page_buffers, 0,
+		walk_page_buffers(handle, inode, page_buffers, 0,
 				PAGE_CACHE_SIZE, NULL, bget_one);
 	}
 
@@ -1310,7 +1321,7 @@
 
 	/* And attach them to the current transaction */
 	if (order_data) {
-		err = walk_page_buffers(handle, page_buffers,
+		err = walk_page_buffers(handle, inode, page_buffers,
 			0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
 		if (!ret)
 			ret = err;

[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]