[Cluster-devel] [PATCH] GFS2: directly write blocks past i_size

Steven Whitehouse swhiteho at redhat.com
Thu Mar 17 09:26:08 UTC 2011


Hi,

On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> GFS2 was relying on the writepage code to write out the zeroed data for
> fallocate.  However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> If it is, it will be ignored.  To work around this, gfs2 now calls
> write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> is set, and it's writing past i_size.
> 
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
>  fs/gfs2/file.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 52 insertions(+), 10 deletions(-)
> 
Generally looks good, but a few minor points....

> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
>  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
>  }
>  
> -static void empty_write_end(struct page *page, unsigned from,
> -			   unsigned to)
> +static int empty_write_end(struct page *page, unsigned from,
> +			   unsigned to, int mode)
>  {
> -	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> +	struct inode *inode = page->mapping->host;
> +	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct buffer_head *bh;
> +	int waiting = 0;
> +	unsigned offset, blksize = 1 << inode->i_blkbits;
> +	loff_t i_size = i_size_read(inode);
> +	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
>  
>  	zero_user(page, from, to-from);
>  	mark_page_accessed(page);
>  
> -	if (!gfs2_is_writeback(ip))
> -		gfs2_page_add_databufs(ip, page, from, to);
> + 	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> +		if (!gfs2_is_writeback(ip))
> +			gfs2_page_add_databufs(ip, page, from, to);
>  
> -	block_commit_write(page, from, to);
> +		block_commit_write(page, from, to);
> +		return 0;
> +	}
> +
> +second_pass:
> +	offset = 0;
> +	bh = page_buffers(page);
> +	while (offset < from) {
> +		offset += blksize;
> +		bh = bh->b_this_page;
> +	}
> +	while (offset < to) {
> +		if (waiting){
> +			wait_on_buffer(bh);
> +			if (!buffer_uptodate(bh))
> +				return -EIO;
> +		}
> +		else {
The else should start on the line above to make it more obvious.

> +			set_buffer_uptodate(bh);
> +			mark_buffer_dirty(bh);
> +			clear_buffer_new(bh);
> +			write_dirty_buffer(bh, WRITE);
Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?

> +		}
> +		offset += blksize;
> +		bh = bh->b_this_page;
> +	}
> +	if (!waiting) {
> +		waiting = 1;
> +		goto second_pass;
> +	}
I think the code might be a bit cleaner if it was just written as two
loops, one after the other since most of the loop content seems to be
different according to weather "waiting" is set or not.

Otherwise I think this is a good solution,

Steve.

> +	return 0;
>  }
>  
>  static int needs_empty_write(sector_t block, struct inode *inode)
> @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
>  	return !buffer_mapped(&bh_map);
>  }
>  
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> +			      int mode)
>  {
>  	struct inode *inode = page->mapping->host;
>  	unsigned start, end, next, blksize;
> @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
>  							  gfs2_block_map);
>  				if (unlikely(ret))
>  					return ret;
> -				empty_write_end(page, start, end);
> +				ret = empty_write_end(page, start, end, mode);
> +				if (unlikely(ret))
> +					return ret;
>  				end = 0;
>  			}
>  			start = next;
> @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
>  		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
>  		if (unlikely(ret))
>  			return ret;
> -		empty_write_end(page, start, end);
> +		ret = empty_write_end(page, start, end, mode);
> +		if (unlikely(ret))
> +			return ret;
>  	}
>  
>  	return 0;
> @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
>  
>  		if (curr == end)
>  			to = end_offset;
> -		error = write_empty_blocks(page, from, to);
> +		error = write_empty_blocks(page, from, to, mode);
>  		if (!error && offset + to > inode->i_size &&
>  		    !(mode & FALLOC_FL_KEEP_SIZE)) {
>  			i_size_write(inode, offset + to);
> 





More information about the Cluster-devel mailing list