[Cluster-devel] [PATCH] GFS2: rewrite fallocate code to write blocks directly
Steven Whitehouse
swhiteho at redhat.com
Sun Sep 18 13:45:20 UTC 2011
Hi,
This looks really good. Sorry for not spotting it the first time around - I
think it got lost in the backlog when I came back from holiday. It's now
in the -nmw tree. Thanks,
Steve.
On Mon, 2011-09-12 at 18:15 -0500, Benjamin Marzinski wrote:
> GFS2's fallocate code currently goes through the page cache. Since it's only
> writing to the end of the file or to holes in it, it doesn't need to, and it
> was causing issues in low-memory environments. This patch pulls in some of
> Steve's block allocation work, and uses it to simply allocate the blocks for
> the file, and zero them out at allocation time. It provides a slight
> performance increase, and it dramatically simplifies the code.
>
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
> fs/gfs2/bmap.c | 12 +++
> fs/gfs2/file.c | 171 +++++++------------------------------------------------
> fs/gfs2/incore.h | 3
> 3 files changed, 39 insertions(+), 147 deletions(-)
>
> Index: gfs2-2.6-nmw/fs/gfs2/file.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/file.c
> +++ gfs2-2.6-nmw/fs/gfs2/file.c
> @@ -669,135 +669,18 @@ static ssize_t gfs2_file_aio_write(struc
> return generic_file_aio_write(iocb, iov, nr_segs, pos);
> }
>
> -static int empty_write_end(struct page *page, unsigned from,
> - unsigned to, int mode)
> -{
> - struct inode *inode = page->mapping->host;
> - struct gfs2_inode *ip = GFS2_I(inode);
> - struct buffer_head *bh;
> - unsigned offset, blksize = 1 << inode->i_blkbits;
> - pgoff_t end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
> -
> - zero_user(page, from, to-from);
> - mark_page_accessed(page);
> -
> - if (page->index < end_index || !(mode & FALLOC_FL_KEEP_SIZE)) {
> - if (!gfs2_is_writeback(ip))
> - gfs2_page_add_databufs(ip, page, from, to);
> -
> - block_commit_write(page, from, to);
> - return 0;
> - }
> -
> - offset = 0;
> - bh = page_buffers(page);
> - while (offset < to) {
> - if (offset >= from) {
> - set_buffer_uptodate(bh);
> - mark_buffer_dirty(bh);
> - clear_buffer_new(bh);
> - write_dirty_buffer(bh, WRITE);
> - }
> - offset += blksize;
> - bh = bh->b_this_page;
> - }
> -
> - offset = 0;
> - bh = page_buffers(page);
> - while (offset < to) {
> - if (offset >= from) {
> - wait_on_buffer(bh);
> - if (!buffer_uptodate(bh))
> - return -EIO;
> - }
> - offset += blksize;
> - bh = bh->b_this_page;
> - }
> - return 0;
> -}
> -
> -static int needs_empty_write(sector_t block, struct inode *inode)
> -{
> - int error;
> - struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> -
> - bh_map.b_size = 1 << inode->i_blkbits;
> - error = gfs2_block_map(inode, block, &bh_map, 0);
> - if (unlikely(error))
> - return error;
> - return !buffer_mapped(&bh_map);
> -}
> -
> -static int write_empty_blocks(struct page *page, unsigned from, unsigned to,
> - int mode)
> -{
> - struct inode *inode = page->mapping->host;
> - unsigned start, end, next, blksize;
> - sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
> - int ret;
> -
> - blksize = 1 << inode->i_blkbits;
> - next = end = 0;
> - while (next < from) {
> - next += blksize;
> - block++;
> - }
> - start = next;
> - do {
> - next += blksize;
> - ret = needs_empty_write(block, inode);
> - if (unlikely(ret < 0))
> - return ret;
> - if (ret == 0) {
> - if (end) {
> - ret = __block_write_begin(page, start, end - start,
> - gfs2_block_map);
> - if (unlikely(ret))
> - return ret;
> - ret = empty_write_end(page, start, end, mode);
> - if (unlikely(ret))
> - return ret;
> - end = 0;
> - }
> - start = next;
> - }
> - else
> - end = next;
> - block++;
> - } while (next < to);
> -
> - if (end) {
> - ret = __block_write_begin(page, start, end - start, gfs2_block_map);
> - if (unlikely(ret))
> - return ret;
> - ret = empty_write_end(page, start, end, mode);
> - if (unlikely(ret))
> - return ret;
> - }
> -
> - return 0;
> -}
> -
> static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
> int mode)
> {
> struct gfs2_inode *ip = GFS2_I(inode);
> struct buffer_head *dibh;
> int error;
> - u64 start = offset >> PAGE_CACHE_SHIFT;
> - unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
> - u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
> - pgoff_t curr;
> - struct page *page;
> - unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
> - unsigned int from, to;
> -
> - if (!end_offset)
> - end_offset = PAGE_CACHE_SIZE;
> + unsigned int nr_blks;
> + sector_t lblock = offset >> inode->i_blkbits;
>
> error = gfs2_meta_inode_buffer(ip, &dibh);
> if (unlikely(error))
> - goto out;
> + return error;
>
> gfs2_trans_add_bh(ip->i_gl, dibh, 1);
>
> @@ -807,39 +690,31 @@ static int fallocate_chunk(struct inode
> goto out;
> }
>
> - curr = start;
> - offset = start << PAGE_CACHE_SHIFT;
> - from = start_offset;
> - to = PAGE_CACHE_SIZE;
> - while (curr <= end) {
> - page = grab_cache_page_write_begin(inode->i_mapping, curr,
> - AOP_FLAG_NOFS);
> - if (unlikely(!page)) {
> - error = -ENOMEM;
> - goto out;
> - }
> + while (len) {
> + struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
> + bh_map.b_size = len;
> + set_buffer_zeronew(&bh_map);
>
> - if (curr == end)
> - to = end_offset;
> - error = write_empty_blocks(page, from, to, mode);
> - if (!error && offset + to > inode->i_size &&
> - !(mode & FALLOC_FL_KEEP_SIZE)) {
> - i_size_write(inode, offset + to);
> - }
> - unlock_page(page);
> - page_cache_release(page);
> - if (error)
> + error = gfs2_block_map(inode, lblock, &bh_map, 1);
> + if (unlikely(error))
> goto out;
> - curr++;
> - offset += PAGE_CACHE_SIZE;
> - from = 0;
> + len -= bh_map.b_size;
> + nr_blks = bh_map.b_size >> inode->i_blkbits;
> + lblock += nr_blks;
> + if (!buffer_new(&bh_map))
> + continue;
> + if (unlikely(!buffer_zeronew(&bh_map))) {
> + error = -EIO;
> + goto out;
> + }
> }
> + if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
> + i_size_write(inode, offset + len);
>
> mark_inode_dirty(inode);
>
> - brelse(dibh);
> -
> out:
> + brelse(dibh);
> return error;
> }
>
> @@ -879,6 +754,7 @@ static long gfs2_fallocate(struct file *
> int error;
> loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
> loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
> + loff_t max_chunk_size = UINT_MAX & bsize_mask;
> next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
>
> /* We only support the FALLOC_FL_KEEP_SIZE mode */
> @@ -932,7 +808,8 @@ retry:
> goto out_qunlock;
> }
> max_bytes = bytes;
> - calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
> + calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
> + &max_bytes, &data_blocks, &ind_blocks);
> al->al_requested = data_blocks + ind_blocks;
>
> rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
> Index: gfs2-2.6-nmw/fs/gfs2/bmap.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/bmap.c
> +++ gfs2-2.6-nmw/fs/gfs2/bmap.c
> @@ -10,6 +10,7 @@
> #include <linux/spinlock.h>
> #include <linux/completion.h>
> #include <linux/buffer_head.h>
> +#include <linux/blkdev.h>
> #include <linux/gfs2_ondisk.h>
> #include <linux/crc32.h>
>
> @@ -427,12 +428,14 @@ static int gfs2_bmap_alloc(struct inode
> {
> struct gfs2_inode *ip = GFS2_I(inode);
> struct gfs2_sbd *sdp = GFS2_SB(inode);
> + struct super_block *sb = sdp->sd_vfs;
> struct buffer_head *dibh = mp->mp_bh[0];
> u64 bn, dblock = 0;
> unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
> unsigned dblks = 0;
> unsigned ptrs_per_blk;
> const unsigned end_of_metadata = height - 1;
> + int ret;
> int eob = 0;
> enum alloc_state state;
> __be64 *ptr;
> @@ -535,6 +538,15 @@ static int gfs2_bmap_alloc(struct inode
> dblock = bn;
> while (n-- > 0)
> *ptr++ = cpu_to_be64(bn++);
> + if (buffer_zeronew(bh_map)) {
> + ret = sb_issue_zeroout(sb, dblock, dblks,
> + GFP_NOFS);
> + if (ret) {
> + fs_err(sdp,
> + "Failed to zero data buffers\n");
> + clear_buffer_zeronew(bh_map);
> + }
> + }
> break;
> }
> } while ((state != ALLOC_DATA) || !dblock);
> Index: gfs2-2.6-nmw/fs/gfs2/incore.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
> +++ gfs2-2.6-nmw/fs/gfs2/incore.h
> @@ -103,12 +103,15 @@ struct gfs2_rgrpd {
> enum gfs2_state_bits {
> BH_Pinned = BH_PrivateStart,
> BH_Escaped = BH_PrivateStart + 1,
> + BH_Zeronew = BH_PrivateStart + 2,
> };
>
> BUFFER_FNS(Pinned, pinned)
> TAS_BUFFER_FNS(Pinned, pinned)
> BUFFER_FNS(Escaped, escaped)
> TAS_BUFFER_FNS(Escaped, escaped)
> +BUFFER_FNS(Zeronew, zeronew)
> +TAS_BUFFER_FNS(Zeronew, zeronew)
>
> struct gfs2_bufdata {
> struct buffer_head *bd_bh;
>
More information about the Cluster-devel
mailing list