[Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly

Steven Whitehouse swhiteho at redhat.com
Sun Sep 18 13:45:20 UTC 2011


Hi,

This looks really good. Sorry for not spotting it first time around - I
think it got lost in the backlog when I came back from holiday. Its now
in the -nmw tree. Thanks,

Steve.

On Mon, 2011-09-12 at 18:15 -0500, Benjamin Marzinski wrote:
> GFS2's fallocate code currently goes through the page cache. Since it's only
> writing to the end of the file or to holes in it, it doesn't need to, and it
> was causing issues on low memory environments. This patch pulls in some of
> Steve's block allocation work, and uses it to simply allocate the blocks for
> the file, and zero them out at allocation time.  It provides a slight
> performance increase, and it dramatically simplifies the code.
> 
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
>  fs/gfs2/bmap.c   |   12 +++
>  fs/gfs2/file.c   |  171 +++++++------------------------------------------------
>  fs/gfs2/incore.h |    3 
>  3 files changed, 39 insertions(+), 147 deletions(-)
> 
> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struc
>  	return generic_file_aio_write(iocb, iov, nr_segs, pos);
>  }
>  
> -static int empty_write_end(struct page *page, unsigned from,
> -			   unsigned to, int mode)
> -{
> -	struct inode *inode = page->mapping->host;
> -	struct gfs2_inode *ip = GFS2_I(inode);
> -	struct buffer_head *bh;
> -	unsigned offset, blksize = 1 << inode->i_blkbits;
> -	pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
> -
> -	zero_user(page, from, to-from);
> -	mark_page_accessed(page);
> -
> -	if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> -		if (!gfs2_is_writeback(ip))
> -			gfs2_page_add_databufs(ip, page, from, to);
> -
> -		block_commit_write(page, from, to);
> -		return 0;
> -	}
> -
> -	offset = 0;
> -	bh = page_buffers(page);
> -	while (offset < to) {
> -		if (offset >= from) {
> -			set_buffer_uptodate(bh);
> -			mark_buffer_dirty(bh);
> -			clear_buffer_new(bh);
> -			write_dirty_buffer(bh, WRITE);
> -		}
> -		offset += blksize;
> -		bh = bh->b_this_page;
> -	}
> -
> -	offset = 0;
> -	bh = page_buffers(page);
> -	while (offset < to) {
> -		if (offset >= from) {
> -			wait_on_buffer(bh);
> -			if (!buffer_uptodate(bh))
> -				return -EIO;
> -		}
> -		offset += blksize;
> -		bh = bh->b_this_page;
> -	}
> -	return 0;
> -}
> -
> -static int needs_empty_write(sector_t block, struct inode *inode)
> -{
> -	int error;
> -	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> -
> -	bh_map.b_size = 1 << inode->i_blkbits;
> -	error = gfs2_block_map(inode, block, &bh_map, 0);
> -	if (unlikely(error))
> -		return error;
> -	return !buffer_mapped(&bh_map);
> -}
> -
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> -			      int mode)
> -{
> -	struct inode *inode = page->mapping->host;
> -	unsigned start, end, next, blksize;
> -	sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
> -	int ret;
> -
> -	blksize = 1 << inode->i_blkbits;
> -	next = end = 0;
> -	while (next < from) {
> -		next += blksize;
> -		block++;
> -	}
> -	start = next;
> -	do {
> -		next += blksize;
> -		ret = needs_empty_write(block, inode);
> -		if (unlikely(ret < 0))
> -			return ret;
> -		if (ret == 0) {
> -			if (end) {
> -				ret = __block_write_begin(page, start, end - start,
> -							  gfs2_block_map);
> -				if (unlikely(ret))
> -					return ret;
> -				ret = empty_write_end(page, start, end, mode);
> -				if (unlikely(ret))
> -					return ret;
> -				end = 0;
> -			}
> -			start = next;
> -		}
> -		else
> -			end = next;
> -		block++;
> -	} while (next < to);
> -
> -	if (end) {
> -		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> -		if (unlikely(ret))
> -			return ret;
> -		ret = empty_write_end(page, start, end, mode);
> -		if (unlikely(ret))
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
>  static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
>  			   int mode)
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	struct buffer_head *dibh;
>  	int error;
> -	u64 start = offset >> PAGE_CACHE_SHIFT;
> -	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
> -	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
> -	pgoff_t curr;
> -	struct page *page;
> -	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
> -	unsigned int from, to;
> -
> -	if (!end_offset)
> -		end_offset = PAGE_CACHE_SIZE;
> +	unsigned int nr_blks;
> +	sector_t lblock = offset >> inode->i_blkbits;
>  
>  	error = gfs2_meta_inode_buffer(ip, &dibh);
>  	if (unlikely(error))
> -		goto out;
> +		return error;
>  
>  	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
>  
> @@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode 
>  			goto out;
>  	}
>  
> -	curr = start;
> -	offset = start << PAGE_CACHE_SHIFT;
> -	from = start_offset;
> -	to = PAGE_CACHE_SIZE;
> -	while (curr <= end) {
> -		page = grab_cache_page_write_begin(inode->i_mapping, curr,
> -						   AOP_FLAG_NOFS);
> -		if (unlikely(!page)) {
> -			error = -ENOMEM;
> -			goto out;
> -		}
> +	while (len) {
> +		struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> +		bh_map.b_size = len;
> +		set_buffer_zeronew(&bh_map);
>  
> -		if (curr == end)
> -			to = end_offset;
> -		error = write_empty_blocks(page, from, to, mode);
> -		if (!error && offset + to > inode->i_size &&
> -		    !(mode & FALLOC_FL_KEEP_SIZE)) {
> -			i_size_write(inode, offset + to);
> -		}
> -		unlock_page(page);
> -		page_cache_release(page);
> -		if (error)
> +		error = gfs2_block_map(inode, lblock, &bh_map, 1);
> +		if (unlikely(error))
>  			goto out;
> -		curr++;
> -		offset += PAGE_CACHE_SIZE;
> -		from = 0;
> +		len -= bh_map.b_size;
> +		nr_blks = bh_map.b_size >> inode->i_blkbits;
> +		lblock += nr_blks;
> +		if (!buffer_new(&bh_map))
> +			continue;
> +		if (unlikely(!buffer_zeronew(&bh_map))) {
> +			error = -EIO;
> +			goto out;
> +		}
>  	}
> +	if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
> +		i_size_write(inode, offset + len);
>  
>  	mark_inode_dirty(inode);
>  
> -	brelse(dibh);
> -
>  out:
> +	brelse(dibh);
>  	return error;
>  }
>  
> @@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *
>  	int error;
>  	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
>  	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
> +	loff_t max_chunk_size = UINT_MAX & bsize_mask;
>  	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
>  
>  	/* We only support the FALLOC_FL_KEEP_SIZE mode */
> @@ -932,7 +808,8 @@ retry:
>  			goto out_qunlock;
>  		}
>  		max_bytes = bytes;
> -		calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
> +		calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
> +				&max_bytes, &data_blocks, &ind_blocks);
>  		al->al_requested = data_blocks + ind_blocks;
>  
>  		rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
> Index: gfs2-2.6-nmw/fs/gfs2/bmap.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/bmap.c
> +++ gfs2-2.6-nmw/fs/gfs2/bmap.c
> @@ -10,6 +10,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/completion.h>
>  #include <linux/buffer_head.h>
> +#include <linux/blkdev.h>
>  #include <linux/gfs2_ondisk.h>
>  #include <linux/crc32.h>
>  
> @@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode 
>  {
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  	struct gfs2_sbd *sdp = GFS2_SB(inode);
> +	struct super_block *sb = sdp->sd_vfs;
>  	struct buffer_head *dibh = mp->mp_bh[0];
>  	u64 bn, dblock = 0;
>  	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
>  	unsigned dblks = 0;
>  	unsigned ptrs_per_blk;
>  	const unsigned end_of_metadata = height - 1;
> +	int ret;
>  	int eob = 0;
>  	enum alloc_state state;
>  	__be64 *ptr;
> @@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode 
>  			dblock = bn;
>  			while (n-- > 0)
>  				*ptr++ = cpu_to_be64(bn++);
> +			if (buffer_zeronew(bh_map)) {
> +				ret = sb_issue_zeroout(sb, dblock, dblks,
> +						       GFP_NOFS);
> +				if (ret) {
> +					fs_err(sdp,
> +					       "Failed to zero data buffers\n");
> +					clear_buffer_zeronew(bh_map);
> +				}
> +			}
>  			break;
>  		}
>  	} while ((state != ALLOC_DATA) || !dblock);
> Index: gfs2-2.6-nmw/fs/gfs2/incore.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
> +++ gfs2-2.6-nmw/fs/gfs2/incore.h
> @@ -103,12 +103,15 @@ struct gfs2_rgrpd {
>  enum gfs2_state_bits {
>  	BH_Pinned = BH_PrivateStart,
>  	BH_Escaped = BH_PrivateStart + 1,
> +	BH_Zeronew = BH_PrivateStart + 2,
>  };
>  
>  BUFFER_FNS(Pinned, pinned)
>  TAS_BUFFER_FNS(Pinned, pinned)
>  BUFFER_FNS(Escaped, escaped)
>  TAS_BUFFER_FNS(Escaped, escaped)
> +BUFFER_FNS(Zeronew, zeronew)
> +TAS_BUFFER_FNS(Zeronew, zeronew)
>  
>  struct gfs2_bufdata {
>  	struct buffer_head *bd_bh;
> 





More information about the Cluster-devel mailing list