[Cluster-devel] [PATCH] GFS2: directly write blocks past i_size

Benjamin Marzinski bmarzins at redhat.com
Thu Mar 17 14:05:41 UTC 2011


On Thu, Mar 17, 2011 at 09:26:08AM +0000, Steven Whitehouse wrote:
> Hi,
> 
> On Thu, 2011-03-17 at 00:08 -0500, Benjamin Marzinski wrote:
> > GFS2 was relying on the writepage code to write out the zeroed data for
> > fallocate.  However, with FALLOC_FL_KEEP_SIZE set, this may be past i_size.
> > If it is, it will be ignored.  To work around this, gfs2 now calls
> > write_dirty_buffer directly on the buffer_heads when FALLOC_FL_KEEP_SIZE
> > is set, and it's writing past i_size.
> > 
> > Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> > ---
> >  fs/gfs2/file.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++----------
> >  1 file changed, 52 insertions(+), 10 deletions(-)
> > 
> Generally looks good, but a few minor points....
> 
> > Index: gfs2-2.6-nmw/fs/gfs2/file.c
> > ===================================================================
> > --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> > +++ gfs2-2.6-nmw/fs/gfs2/file.c
> > @@ -617,18 +617,55 @@ static ssize_t gfs2_file_aio_write(struc
> >  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
> >  }
> >  
> > -static void empty_write_end(struct page *page, unsigned from,
> > -			   unsigned to)
> > +static int empty_write_end(struct page *page, unsigned from,
> > +			   unsigned to, int mode)
> >  {
> > -	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> > +	struct inode *inode = page->mapping->host;
> > +	struct gfs2_inode *ip = GFS2_I(inode);
> > +	struct buffer_head *bh;
> > +	int waiting = 0;
> > +	unsigned offset, blksize = 1 << inode->i_blkbits;
> > +	loff_t i_size = i_size_read(inode);
> > +	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
> >  
> >  	zero_user(page, from, to-from);
> >  	mark_page_accessed(page);
> >  
> > -	if (!gfs2_is_writeback(ip))
> > -		gfs2_page_add_databufs(ip, page, from, to);
> > + 	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> > +		if (!gfs2_is_writeback(ip))
> > +			gfs2_page_add_databufs(ip, page, from, to);
> >  
> > -	block_commit_write(page, from, to);
> > +		block_commit_write(page, from, to);
> > +		return 0;
> > +	}
> > +
> > +second_pass:
> > +	offset = 0;
> > +	bh = page_buffers(page);
> > +	while (offset < from) {
> > +		offset += blksize;
> > +		bh = bh->b_this_page;
> > +	}
> > +	while (offset < to) {
> > +		if (waiting){
> > +			wait_on_buffer(bh);
> > +			if (!buffer_uptodate(bh))
> > +				return -EIO;
> > +		}
> > +		else {
> The else should start on the line above to make it more obvious.
> 
> > +			set_buffer_uptodate(bh);
> > +			mark_buffer_dirty(bh);
> > +			clear_buffer_new(bh);
> > +			write_dirty_buffer(bh, WRITE);
> Should this be WRITE_SYNC or WRITE_SYNC_PLUG I wonder?
> 
> > +		}
> > +		offset += blksize;
> > +		bh = bh->b_this_page;
> > +	}
> > +	if (!waiting) {
> > +		waiting = 1;
> > +		goto second_pass;
> > +	}
> I think the code might be a bit cleaner if it was just written as two
> loops, one after the other since most of the loop content seems to be
> different according to weather "waiting" is set or not.
> 
> Otherwise I think this is a good solution,
> 
> Steve.
> 


Sure.  This patch kept evolving, and I probably should have given it a
better cleanup once I was satisfied that it worked.  I send off a new
version.

-Ben

> > +	return 0;
> >  }
> >  
> >  static int needs_empty_write(sector_t block, struct inode *inode)
> > @@ -643,7 +680,8 @@ static int needs_empty_write(sector_t bl
> >  	return !buffer_mapped(&bh_map);
> >  }
> >  
> > -static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> > +static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> > +			      int mode)
> >  {
> >  	struct inode *inode = page->mapping->host;
> >  	unsigned start, end, next, blksize;
> > @@ -668,7 +706,9 @@ static int write_empty_blocks(struct pag
> >  							  gfs2_block_map);
> >  				if (unlikely(ret))
> >  					return ret;
> > -				empty_write_end(page, start, end);
> > +				ret = empty_write_end(page, start, end, mode);
> > +				if (unlikely(ret))
> > +					return ret;
> >  				end = 0;
> >  			}
> >  			start = next;
> > @@ -682,7 +722,9 @@ static int write_empty_blocks(struct pag
> >  		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> >  		if (unlikely(ret))
> >  			return ret;
> > -		empty_write_end(page, start, end);
> > +		ret = empty_write_end(page, start, end, mode);
> > +		if (unlikely(ret))
> > +			return ret;
> >  	}
> >  
> >  	return 0;
> > @@ -731,7 +773,7 @@ static int fallocate_chunk(struct inode 
> >  
> >  		if (curr == end)
> >  			to = end_offset;
> > -		error = write_empty_blocks(page, from, to);
> > +		error = write_empty_blocks(page, from, to, mode);
> >  		if (!error && offset + to > inode->i_size &&
> >  		    !(mode & FALLOC_FL_KEEP_SIZE)) {
> >  			i_size_write(inode, offset + to);
> > 
> 




More information about the Cluster-devel mailing list